In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt


In [None]:
# Read the cleaned video-game sales table from the parent directory into `df`.
csv_path = r"../video_game_sales_final_cleaned.csv"
df = pd.read_csv(csv_path)

In [None]:
# Distinct values present in the grouped-rating column, in order of first appearance.
rating_column = df["rating_grouped"]
unique_ratings = rating_column.unique()
unique_ratings

In [None]:
# Number of rows per rating group, most frequent first.
rating_counts = df['rating_grouped'].value_counts()
print("\nRating Counts:\n", rating_counts)

In [None]:
# Reduce the problem to two classes: keep only rows rated 'Adults' or 'Children'.
# The result is an independent copy, so adding the target column below does not
# touch `df`.
two_class_mask = df['rating_grouped'].isin(['Adults', 'Children'])
df_binary = df.loc[two_class_mask].copy()

# Binary target: 1 for 'Adults', 0 for 'Children'.
df_binary['is_mature'] = df_binary['rating_grouped'].eq('Adults').astype(int)


In [None]:
# Bar chart of how many rows fall into each rating group (all groups in `df`,
# not just the two kept for the binary task).
rating_frequencies = df['rating_grouped'].value_counts()
rating_frequencies.plot.bar(color='skyblue')
plt.xlabel("Rating")
plt.ylabel("Count")
plt.title("Distribution of Standardized Game Ratings")
plt.xticks(rotation=45)  # tilt the category labels
plt.show()

In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler



In [None]:
# Model inputs: two categorical columns plus regional sales and release year.
features = [
    'genre',
    'publisher',
    'na_sales',
    'eu_sales',
    'jp_sales',
    'year_of_release',
]

# Feature matrix and binary target taken from the two-class subset.
X = df_binary.loc[:, features]
y = df_binary.loc[:, 'is_mature']

In [None]:

# Column groups, so that each group can receive its own preprocessing.
categorical_features = [
    'genre',
    'publisher',
]
numerical_features = [
    'na_sales',
    'eu_sales',
    'jp_sales',
    'year_of_release',
]

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both partitions; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

# Relative class frequencies in the training partition.
train_class_share = y_train.value_counts(normalize=True)
print("\nClass distribution in training set:")
print(train_class_share)

In [None]:
# Preprocessing applied inside both pipelines:
#   - 'num': numeric columns are standardized (zero mean, unit variance).
#     The regularized LogisticRegression below is sensitive to feature scale,
#     and the numeric columns are presumably on very different scales
#     (calendar years vs. sales figures -- confirm units). Passing them through
#     raw slows solver convergence and makes the penalty uneven across features.
#     Tree-based models are unaffected by this monotonic rescaling.
#   - 'cat': categorical columns are one-hot encoded; categories not seen during
#     fit are encoded as all zeros instead of raising at predict time.
# Transformer names and order ('num' first, then 'cat') are relied on by the
# feature-importance cells, so keep them stable.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# NOTE: the same `preprocessor` object is handed to both pipelines, so whichever
# pipeline is fitted last determines its fitted state. That is fine as long as
# both pipelines are fitted on the same training data.

# Logistic regression baseline.
pipeline_lr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])

# Random forest; class_weight='balanced' reweights classes inversely to their
# frequency in the training labels.
pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, class_weight='balanced'))
])

In [None]:
# Fit the logistic-regression pipeline on the training split and score it on
# the held-out test split.
pipeline_lr.fit(X_train, y_train)
y_pred_lr = pipeline_lr.predict(X_test)

lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_report = classification_report(y_test, y_pred_lr)
lr_confusion = confusion_matrix(y_test, y_pred_lr)  # rows: true class, columns: predicted class

print("Logistic Regression Results:")
print(f"Accuracy:{lr_accuracy:.2f}")
print("\nClassification Report:")
print(lr_report)
print("\nConfusion Matrix:")
print(lr_confusion)

In [None]:
# Fit the random-forest pipeline on the training split and score it on the
# held-out test split. The forest was configured with class_weight='balanced'
# to compensate for unequal class sizes.
pipeline_rf.fit(X_train, y_train)
y_pred_rf = pipeline_rf.predict(X_test)

rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
rf_confusion = confusion_matrix(y_test, y_pred_rf)  # rows: true class, columns: predicted class

print("\nRandom Forest Results:")
print(f"Accuracy: {rf_accuracy:.2f}")
print("\nClassification Report:")
print(rf_report)
print("\nConfusion Matrix:")
print(rf_confusion)

In [None]:
# Rebuild the column names of the transformed matrix: the numeric columns come
# first, followed by the one-hot columns produced by the fitted 'cat' encoder.
fitted_preprocessor = pipeline_rf.named_steps['preprocessor']
cat_encoder = fitted_preprocessor.named_transformers_['cat']
encoded_names = cat_encoder.get_feature_names_out(categorical_features)
feature_names = [*numerical_features, *encoded_names]


In [None]:

# Pull the per-feature importances out of the fitted random forest.
rf_classifier = pipeline_rf.named_steps['classifier']
importances = rf_classifier.feature_importances_

In [None]:
# Pair each transformed feature name with its importance and rank them from
# most to least important.
importance_table = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
importance_df = importance_table.sort_values(by='Importance', ascending=False)


In [None]:
# Show the ten highest-ranked rows of the importance table.
top_features = importance_df.head(10)
print("\nTop 10 Important Features:")
print(top_features)