In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
import joblib
import os

TITANIC DATASET

In [4]:
# Titanic survival classifier: load the passenger CSV, keep five features plus
# the target, fit a logistic regression, and persist it with joblib.

# Load dataset
titanic_raw = pd.read_csv("titanic.csv")

# Preprocessing: select the modelling columns and drop rows with any missing
# value. The explicit .copy() yields an independent frame, so the column
# assignment below never writes into a slice of the raw frame (the
# slice-then-mutate pattern can raise pandas' SettingWithCopyWarning).
model_columns = ['Pclass', 'Age', 'SibSp', 'Fare', 'Sex', 'Survived']
titanic = titanic_raw[model_columns].dropna().copy()

# Encode 'Sex' as 1 (male) / 0 (female); any other label maps to NaN.
titanic['Sex'] = titanic['Sex'].map({'male': 1, 'female': 0})

X = titanic.drop("Survived", axis=1)
y = titanic['Survived']

# Split data: 80% train / 20% test with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model. The features are not scaled, so the solver can reach the
# default iteration limit before converging; a higher max_iter gives it room.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Save model. Create the output directory first so a fresh checkout without
# ml_models/ does not fail inside joblib.dump.
os.makedirs("ml_models", exist_ok=True)
joblib.dump(clf, "ml_models/titanic_model.pkl")


['ml_models/titanic_model.pkl']

INDIAN MOVIE DATASET

In [5]:
# Load dataset with correct encoding
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
import joblib

# Indian movie rating regressor: label-encode three categorical columns, fit a
# random forest on them to predict 'Rating', and persist the model together
# with the encoders needed to reproduce the encoding at prediction time.

# Read CSV (the file is read as latin1)
movie_raw = pd.read_csv("imdb_indian_movies.csv", encoding='latin1')

# Select relevant columns and drop rows with any missing value.
categorical_cols = ['Genre', 'Director', 'Actor 1']
movie = movie_raw[categorical_cols + ['Rating']].dropna().copy()

# Encode categorical columns with ONE encoder per column. Refitting a single
# shared LabelEncoder overwrites its classes on every iteration, so only the
# last column's mapping would survive and the others could not be reproduced.
movie_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    movie[col] = le.fit_transform(movie[col])
    movie_encoders[col] = le

# Features and target
X = movie[categorical_cols]
y = movie['Rating']

# Split and train (80% / 20%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the forest so re-running the cell rebuilds the same model.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Save artifacts. The directory is created first so a fresh checkout works.
# The encoders go to their own file; the model path is unchanged.
os.makedirs("ml_models", exist_ok=True)
joblib.dump(movie_encoders, "ml_models/movie_encoders.pkl")
joblib.dump(model, "ml_models/movie_model.pkl")


['ml_models/movie_model.pkl']

IRIS DATASET

In [6]:
# Iris species classifier: read the CSV, integer-encode the 'species' column,
# fit a decision tree, and persist it with joblib.

# Load dataset
iris = pd.read_csv("iris.csv")  # use this instead of sklearn load_iris for full preprocessing flexibility

X = iris.drop("species", axis=1)
y = iris["species"]

# Encode target: species labels become integer codes; `le` keeps the mapping
# (le.classes_) for turning predictions back into labels.
le = LabelEncoder()
y = le.fit_transform(y)

# Split and train (80% / 20%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the tree so re-running the cell rebuilds the same model.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Create the output directory first so a fresh checkout without ml_models/
# does not fail inside joblib.dump.
os.makedirs("ml_models", exist_ok=True)
joblib.dump(model, "ml_models/iris_model.pkl")


['ml_models/iris_model.pkl']

SALES DATASET

In [7]:
# Sales regressor: fit a linear regression of 'Sales' on the three advertising
# spend columns and persist it with joblib.

# Load dataset
sales = pd.read_csv("advertising.csv")  # filename corrected

X = sales[['TV', 'Radio', 'Newspaper']]
y = sales['Sales']

# 80% train / 20% test with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

# Create the output directory first so a fresh checkout without ml_models/
# does not fail inside joblib.dump.
os.makedirs("ml_models", exist_ok=True)
joblib.dump(model, "ml_models/sales_model.pkl")


['ml_models/sales_model.pkl']

FRAUD_APP DATASET

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import joblib

# Credit-card fraud classifier: add a row-wise mean of V1..V28 as an extra
# feature, undersample the majority class, fit a random forest, and persist it.

# Load dataset
fraud = pd.read_csv("creditcard.csv")

# Add Averaged_V feature (mean of V1 to V28)
v_cols = [f'V{i}' for i in range(1, 29)]
fraud['Averaged_V'] = fraud[v_cols].mean(axis=1)

# Undersample majority class. Cap the sample size at the number of available
# rows: DataFrame.sample raises when asked for more rows than exist.
fraud_majority = fraud[fraud['Class'] == 0]
fraud_0 = fraud_majority.sample(min(10000, len(fraud_majority)), random_state=42)
fraud_1 = fraud[fraud['Class'] == 1]
# NOTE(review): the class ratio after this step depends on how many Class == 1
# rows the file holds; the result is not necessarily 50/50 despite the name.
balanced = pd.concat([fraud_0, fraud_1])

X = balanced.drop('Class', axis=1)
y = balanced['Class']

# Train-test split. Stratify on the label so both partitions keep the same
# class proportions instead of leaving the minority share to chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train model. Seed the forest so re-running the cell rebuilds the same model.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Save model. Create the output directory first so a fresh checkout without
# ml_models/ does not fail inside joblib.dump.
os.makedirs("ml_models", exist_ok=True)
joblib.dump(model, "ml_models/fraud_model_with_avg.pkl")


['ml_models/fraud_model_with_avg.pkl']

IMDB INDIAN MOVIE DATASET: COLUMN NAMES

In [9]:
import pandas as pd

# Read the movie CSV as latin1 and print its column labels as a plain list,
# to check the exact header spellings in the file.
df = pd.read_csv("imdb_indian_movies.csv", encoding='latin1')
column_names = list(df.columns)
print(column_names)


['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
