In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report



In [2]:
# Assuming your data is loaded into a DataFrame called `data`
raw_data = pd.read_csv('speed_data_data.csv')
data = raw_data.copy()

In [8]:

imputer = SimpleImputer(strategy='mean')
data.iloc[:, :] = imputer.fit_transform(data)



In [10]:
missing_values = data.isnull().sum()
print(missing_values)

gender    0
age       0
income    0
goal      0
dec       0
attr      0
sinc      0
intel     0
fun       0
amb       0
shar      0
like      0
prob      0
met       0
dtype: int64


In [11]:
# Convert all columns except 'dec' and 'gender' to numeric if they are not
for col in data.columns:
    if col != 'dec' and col != 'gender':
        data[col] = pd.to_numeric(data[col], errors='coerce')

# Convert 'gender' to numeric if it's a string
if data['gender'].dtype == object:
    data['gender'] = data['gender'].astype('category').cat.codes

# Ensure target variable is numeric
data['dec'] = pd.to_numeric(data['dec'], errors='coerce')

# Re-impute if any coercion resulted in NaNs
data.iloc[:, :] = imputer.fit_transform(data)


In [14]:
X = data.drop(columns=['dec'])
y = data['dec']

smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)


In [25]:
count_1 = 0
count_0 = 0



for value in y_res:
    if value == 1:
        count_1 += 1
    elif value == 0:
        count_0 += 1
        
print(f'Count of class 1: {count_1}')
print(f'Count of class 0: {count_0}')


Count of class 1: 4860
Count of class 0: 4860


In [26]:
#Splitting into training and test datasets
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)


In [27]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [30]:
model=RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_train)

In [31]:
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix (y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)

Accuracy: 0.8317901234567902
Confusion Matrix:
[[788 158]
 [169 829]]
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.83      0.83       946
           1       0.84      0.83      0.84       998

    accuracy                           0.83      1944
   macro avg       0.83      0.83      0.83      1944
weighted avg       0.83      0.83      0.83      1944

