In [2]:
# Import libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load dataset
df = pd.read_csv("Amazon_Unlocked_Mobile.csv")

# Select features and target
X = df[['Price', 'Brand Name', 'Review Votes']].copy()

y = df['Rating']

# Train-test split
# Done before imputation so the fill values below are learned from the
# training rows only; computing them on the full frame would leak test-set
# statistics into the model inputs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Handle missing values
# The filled frames are assigned back rather than produced with
# `X[col].fillna(..., inplace=True)`: that form mutates a temporary column
# selection, which pandas 2.x flags with a FutureWarning and which has no
# effect on the frame once copy-on-write is enabled.
fill_values = {
    'Price': X_train['Price'].median(),
    'Review Votes': X_train['Review Votes'].median(),
    'Brand Name': X_train['Brand Name'].mode()[0],
}
X_train = X_train.fillna(fill_values)
X_test = X_test.fillna(fill_values)

# Preprocessing: the brand column is one-hot encoded (categories unseen during
# fit are encoded as all zeros); every other column is forwarded unchanged.
brand_encoder = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[('cat', brand_encoder, ['Brand Name'])],
    remainder='passthrough',
)

# Model pipeline: preprocessing followed by an ordinary least-squares fit
linear_regressor = LinearRegression()
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', linear_regressor),
])

# Train on the training split, then predict the held-out split
# (Pipeline.fit returns the fitted pipeline, so the calls can be chained)
y_pred = model.fit(X_train, y_train).predict(X_test)

# Evaluation on the held-out split
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Model Performance:")
print(f"MAE  : {mae:.2f}")
print(f"RMSE : {rmse:.2f}")
print(f"R²   : {r2:.2f}")


Model Performance:
MAE  : 1.30
RMSE : 1.53
R²   : 0.03


In [3]:
# ================================
# 1. IMPORT LIBRARIES
# ================================

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# ================================
# 2. LOAD DATASET
# ================================

df = pd.read_csv("Amazon_Unlocked_Mobile.csv")


# ================================
# 3. FEATURE SELECTION
# ================================

features = ['Price', 'Brand Name', 'Review Votes']
target = 'Rating'

X = df[features].copy()
y = df[target]


# ================================
# 4. TRAIN-TEST SPLIT
# ================================
# The split happens before imputation so that the fill values in the next
# step are derived from the training rows only (no test-set leakage).

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)


# ================================
# 5. HANDLE MISSING VALUES (SAFE)
# ================================
# Median for the numeric columns, most frequent value for the brand; all
# three statistics come from X_train and are applied to both splits.

fill_values = {
    'Price': X_train['Price'].median(),
    'Review Votes': X_train['Review Votes'].median(),
    'Brand Name': X_train['Brand Name'].mode()[0],
}
X_train = X_train.fillna(fill_values)
X_test = X_test.fillna(fill_values)


# ================================
# 6. PREPROCESSING PIPELINE
# ================================

categorical_features = ['Brand Name']
numerical_features = ['Price', 'Review Votes']

# Both column groups are listed explicitly: the brand is one-hot encoded and
# the numeric columns are passed through unchanged. For the three selected
# features this yields the same matrix as an implicit remainder='passthrough'
# (encoded columns first, numeric columns after), but `numerical_features`
# is now actually used and any unexpected extra column is not silently
# forwarded to the model.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', 'passthrough', numerical_features)
    ]
)


# ================================
# 7. MODEL PIPELINE (RANDOM FOREST)
# ================================
# Depth and leaf-size limits constrain the individual trees; random_state
# makes the run repeatable and n_jobs=-1 uses all available cores.

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(
        n_estimators=200,
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1
    ))
])


# ================================
# 8. TRAIN MODEL
# ================================

model.fit(X_train, y_train)


# ================================
# 9. PREDICTIONS
# ================================

y_pred = model.predict(X_test)


# ================================
# 10. MODEL EVALUATION
# ================================
# All metrics are computed on the held-out test split.

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("\nMODEL PERFORMANCE (Random Forest Regressor)")
print("------------------------------------------")
print(f"MAE  : {mae:.2f}")
print(f"RMSE : {rmse:.2f}")
print(f"R²   : {r2:.2f}")



MODEL PERFORMANCE (Random Forest Regressor)
------------------------------------------
MAE  : 1.25
RMSE : 1.48
R²   : 0.09


In [4]:
# ================================
# 1. IMPORT LIBRARIES
# ================================

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# ================================
# 2. LOAD DATA
# ================================

df = pd.read_csv("Amazon_Unlocked_Mobile.csv")


# ================================
# 3. CREATE BINARY TARGET
# ================================
# Ratings above 3 become class 1, everything else class 0.
# NOTE: a missing Rating compares False against 3 and is therefore labelled 0.

df['Rating_Class'] = np.where(df['Rating'] > 3, 1, 0)

X = df[['Price', 'Brand Name', 'Review Votes']].copy()
y = df['Rating_Class']


# ================================
# 4. TRAIN-TEST SPLIT
# ================================
# Stratified on the target so both splits keep the same class proportions.
# The split happens before imputation so that the fill values in the next
# step are derived from the training rows only (no test-set leakage).

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


# ================================
# 5. HANDLE MISSING VALUES
# ================================
# Median for the numeric columns, most frequent value for the brand; all
# three statistics come from X_train and are applied to both splits.

fill_values = {
    'Price': X_train['Price'].median(),
    'Review Votes': X_train['Review Votes'].median(),
    'Brand Name': X_train['Brand Name'].mode()[0],
}
X_train = X_train.fillna(fill_values)
X_test = X_test.fillna(fill_values)


# ================================
# 6. PREPROCESSING
# ================================
# One-hot encode the brand (categories unseen during fit are encoded as all
# zeros); the remaining columns are forwarded unchanged.

brand_encoder = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[('cat', brand_encoder, ['Brand Name'])],
    remainder='passthrough',
)


# ================================
# 7. MODEL PIPELINE
# ================================

boosted_clf = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
)
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', boosted_clf),
])


# ================================
# 8. TRAIN MODEL
# ================================

model.fit(X_train, y_train)


# ================================
# 9. EVALUATION
# ================================
# Every figure below is computed on the held-out test split.

y_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("\nCLASSIFICATION PERFORMANCE")
print("--------------------------")
print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", report_text)
print("\nConfusion Matrix:\n", conf_matrix)



CLASSIFICATION PERFORMANCE
--------------------------
Accuracy: 0.6980113087183453

Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.09      0.16     25769
           1       0.70      0.97      0.82     56999

    accuracy                           0.70     82768
   macro avg       0.65      0.53      0.49     82768
weighted avg       0.67      0.70      0.61     82768


Confusion Matrix:
 [[ 2310 23459]
 [ 1536 55463]]


In [5]:
# ================================
# IMPROVED BALANCED CLASSIFIER
# ================================

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Random forest with class re-weighting. This cell reuses `Pipeline`,
# `preprocessor` and the train/test splits created by the previous cell.
forest_params = dict(
    n_estimators=300,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=3,
    # 'balanced' weights each class inversely to its frequency in y_train
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', RandomForestClassifier(**forest_params)),
])

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained
y_pred = model.fit(X_train, y_train).predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("\nBALANCED CLASSIFICATION PERFORMANCE")
print("----------------------------------")
print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", report_text)
print("\nConfusion Matrix:\n", conf_matrix)



BALANCED CLASSIFICATION PERFORMANCE
----------------------------------
Accuracy: 0.6030591532959598

Classification Report:
               precision    recall  f1-score   support

           0       0.41      0.64      0.50     25769
           1       0.78      0.58      0.67     56999

    accuracy                           0.60     82768
   macro avg       0.60      0.61      0.59     82768
weighted avg       0.67      0.60      0.62     82768


Confusion Matrix:
 [[16621  9148]
 [23706 33293]]


In [6]:
"""
============================================================
Data Science Internship - Task 1
Title : Titanic Survival Prediction
Author: Eliya Abbas Sayyed
Objective:
    Build a simple predictive model using Scikit-learn
    to predict passenger survival based on input features.
============================================================
"""

# -----------------------------------------------------------
# 1. Import Required Libraries
# -----------------------------------------------------------

import pandas as pd
import seaborn as sns
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


# -----------------------------------------------------------
# 2. Load Dataset
# -----------------------------------------------------------
# The Titanic example dataset is obtained through seaborn's loader,
# so no CSV file has to be placed next to the notebook.

df = sns.load_dataset("titanic")

# Quick look at the size and the available columns
print("\nDataset Shape:", df.shape)
print("\nDataset Columns:\n", df.columns)


# -----------------------------------------------------------
# 3. Select Features and Target Variable
# -----------------------------------------------------------
# Target : survived -> 0 (No), 1 (Yes)
# Inputs : a mix of numerical and categorical columns, so that both
#          preprocessing branches below are exercised

feature_columns = ['pclass', 'sex', 'age', 'fare', 'embarked']
X = df[feature_columns]
y = df['survived']


# -----------------------------------------------------------
# 4. Identify Feature Types
# -----------------------------------------------------------
# Each group is routed to its own preprocessing branch;
# pclass is handled as a category rather than as a number.

numeric_features = ['age', 'fare']
categorical_features = ['sex', 'embarked', 'pclass']


# -----------------------------------------------------------
# 5. Create Preprocessing Pipelines
# -----------------------------------------------------------

# Numerical branch: missing values are replaced by the column median
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])

# Categorical branch: missing values are replaced by the most frequent
# category, then the categories are one-hot encoded (categories unseen
# during fit are encoded as all zeros)
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its branch
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features),
])


# -----------------------------------------------------------
# 6. Split Data into Training and Testing Sets
# -----------------------------------------------------------
# 80% of the rows are used for training, 20% are held out;
# the fixed random_state keeps the split repeatable.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


# -----------------------------------------------------------
# 7. Build the Machine Learning Pipeline
# -----------------------------------------------------------
# Imputation and encoding live inside the pipeline, so they are
# fitted on the training rows only when the pipeline is fitted.

logistic_clf = LogisticRegression(max_iter=1000)
model = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', logistic_clf),
])


# -----------------------------------------------------------
# 8. Train the Model / 9. Make Predictions
# -----------------------------------------------------------
# Pipeline.fit returns the fitted pipeline, so both steps are chained.

y_pred = model.fit(X_train, y_train).predict(X_test)


# -----------------------------------------------------------
# 10. Evaluate Model Performance
# -----------------------------------------------------------
# Both the accuracy and the per-class report use the held-out split.

accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)

print("\nModel Evaluation Results")
print("-" * 30)
print(f"Accuracy Score: {accuracy:.4f}")

print("\nClassification Report:")
print(report_text)


# -----------------------------------------------------------
# 11. Conclusion
# -----------------------------------------------------------
# On this split the logistic-regression baseline reaches roughly
# 0.79 accuracy (see the printed score above).

print("\nModel training and evaluation completed successfully.")



Dataset Shape: (891, 15)

Dataset Columns:
 Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

Model Evaluation Results
------------------------------
Accuracy Score: 0.7933

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.84      0.83       105
           1       0.76      0.73      0.74        74

    accuracy                           0.79       179
   macro avg       0.79      0.78      0.79       179
weighted avg       0.79      0.79      0.79       179


Model training and evaluation completed successfully.
