### Decision Tree

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
data = pd.read_csv('/content/drive/My Drive/Colab Notebooks/w207/final/DATASCI 207 Project/steam-games-cleaned-final.csv')

In [4]:
data.head(5)

Unnamed: 0,genres,discounted_price,about_description,awards,age_of_game,Action,Adventure,Animation & Modeling,Audio Production,Casual,...,RPG,Racing,Simulation,Software Training,Sports,Strategy,Utilities,Video Production,Web Publishing,binary_class
0,"Action, Free to Play",0.0,"For over two decades, Counter-Strike has offer...",1,12.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,positive
1,"Action, Strategy, Free to Play",0.0,"Every day, millions of players worldwide enter...",0,11.0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,negative
2,"Action, RPG",3599.0,"THE NEW FANTASY ACTION RPG. Rise, Tarnished, a...",6,2.0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,positive
3,"Action, Adventure, Free to Play",0.0,Destiny 2 is an action MMO with a single evolv...,0,5.0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,negative
4,RPG,1499.0,"Cyberpunk 2077 is an open-world, action-advent...",4,4.0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,positive


In [5]:
# Encode the binary class as 0 and 1
label_encoder = LabelEncoder()
data['binary_class_encoded'] = label_encoder.fit_transform(data['binary_class'])

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np


In [7]:
# Prepare features
numerical_features = data[['discounted_price', 'age_of_game', 'awards']]
genre_features = data.iloc[:, 5:-2]  # Genre columns
text_features = data['about_description']

# Standardize numerical features
scaler = StandardScaler()
numerical_features_scaled = scaler.fit_transform(numerical_features)

# Create TF-IDF embeddings for text data
tfidf_vectorizer = TfidfVectorizer(max_features=500)
text_embeddings = tfidf_vectorizer.fit_transform(text_features).toarray()


In [8]:
# Prepare the final input features by concatenating all
X = np.hstack((numerical_features_scaled, genre_features, text_embeddings))
y = data['binary_class_encoded']

In [9]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Split the training data further into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)


In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score

In [11]:
# Initialize the decision tree model
decision_tree = DecisionTreeClassifier(random_state=42)

# Define the hyperparameters and their values
param_grid = {
    'max_depth': [5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': [None, 'sqrt', 'log2']
}

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(estimator = decision_tree,
                           param_grid = param_grid,
                           cv = 5,
                           n_jobs = -1,
                           scoring = 'accuracy')
grid_search.fit(X_train, y_train)

  pid = os.fork()
  pid = os.fork()


In [12]:
# Get the best hyperparameters
best_params = grid_search.best_params_
print(f"Best hyperparameters: {best_params}")


Best hyperparameters: {'max_depth': 10, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}


In [13]:
# Train the decision tree model with the best hyperparameters
best_decision_tree = grid_search.best_estimator_
best_decision_tree.fit(X_train, y_train)

In [14]:
# Make predictions
y_train_pred = best_decision_tree.predict(X_train)
y_val_pred = best_decision_tree.predict(X_val)
y_test_pred = best_decision_tree.predict(X_test)

# Evaluate the model
train_accuracy = accuracy_score(y_train, y_train_pred)
val_accuracy = accuracy_score(y_val, y_val_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_accuracy}")
print(f"Validation Accuracy: {test_accuracy}")
print("Classification Report (Validation):")
print(classification_report(y_test, y_test_pred))

Training Accuracy: 0.6654408656691183
Validation Accuracy: 0.6072017243565361
Classification Report (Validation):
              precision    recall  f1-score   support

           0       0.62      0.61      0.61      4039
           1       0.60      0.60      0.60      3848

    accuracy                           0.61      7887
   macro avg       0.61      0.61      0.61      7887
weighted avg       0.61      0.61      0.61      7887



In [15]:
# Make predictions
y_pred = best_decision_tree.predict(X_test)


In [16]:
# Evaluate the model
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))


Accuracy: 0.6072017243565361
              precision    recall  f1-score   support

           0       0.62      0.61      0.61      4039
           1       0.60      0.60      0.60      3848

    accuracy                           0.61      7887
   macro avg       0.61      0.61      0.61      7887
weighted avg       0.61      0.61      0.61      7887

