In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the survey CSV and separate the target column "Y" from the feature columns.
# The file location can be overridden with the HAPPINESS_SURVEY_CSV environment
# variable so the notebook is runnable on other machines; when the variable is
# unset the original local path is used, so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    "HAPPINESS_SURVEY_CSV",
    r"C:\Users\Prashal\Downloads\ACME-HappinessSurvey2020.csv",
)
df = pd.read_csv(DATA_PATH)
x = df.drop(columns="Y")  # every column except the target
y = df["Y"]               # target used by all models below

In [3]:
# Hold out 25% of the rows for testing; stratifying on y keeps the class
# proportions of the target the same in both splits, and the fixed seed makes
# the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    stratify=y,
    random_state=42,
    test_size=0.25,
)

In [4]:
# Standardise the features using statistics learned from the training split only,
# then apply the same transformation to the test split.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Logistic regression on all features; class_weight='balanced' reweights the
# classes inversely to their frequency in y_train.
model = LogisticRegression(class_weight='balanced', max_iter=1000).fit(
    x_train_scaled, y_train
)

# Evaluate on the held-out split.
y_pred = model.predict(x_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("\nTest Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred))


Test Accuracy: 0.71875
Confusion Matrix:
 [[ 7  7]
 [ 2 16]]


In [5]:
# Coefficient vector of the fitted logistic regression (row 0 of coef_) and its
# element-wise magnitude, printed to compare how strongly each scaled feature
# weighs in the model.
coefficient = model.coef_[0]
abs_coefficient = np.absolute(coefficient)
print(abs_coefficient)

[0.27363574 0.16508311 0.08465613 0.01009714 0.2472131  0.06041236]


In [6]:
# Refit the logistic regression on a reduced feature set.
# NOTE(review): the three columns are presumably the ones with the largest
# absolute coefficients printed by the previous cell -- confirm against the
# column order of the CSV.
selected_features = ["X1", "X2", "X5"]

x_sel = df[selected_features]

# Same test_size / random_state / stratify arguments as the full-feature split.
x_train_s, x_test_s, y_train_s, y_test_s = train_test_split(
    x_sel, y, test_size=0.25, random_state=42, stratify=y
)

# Use a dedicated scaler: refitting the shared `scaler` here would leave it
# fitted on three columns, so any later scaler.transform() on the full feature
# matrix would fail or silently use the wrong statistics.
scaler_s = StandardScaler()
x_train_s_scaled = scaler_s.fit_transform(x_train_s)
x_test_s_scaled = scaler_s.transform(x_test_s)

# `model` is intentionally rebound to the reduced-feature classifier, as before.
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(x_train_s_scaled, y_train_s)

y_pred = model.predict(x_test_s_scaled)
accuracy = accuracy_score(y_test_s, y_pred)

print("\nTest Accuracy:", accuracy)
# Score against this cell's own labels (y_test_s), not the earlier y_test.
print("Confusion Matrix:\n", confusion_matrix(y_test_s, y_pred))


Test Accuracy: 0.71875
Confusion Matrix:
 [[ 7  7]
 [ 2 16]]


In [7]:
# 5-fold cross-validated accuracy of a liblinear logistic regression on the
# scaled training split.
# NOTE(review): x_train_scaled was standardised with statistics from the whole
# training split, so each fold's validation rows influenced the scaling; wrapping
# the scaler and the model in a Pipeline would remove that leakage.
base_model = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
    max_iter=1000,
)

cv_scores = cross_val_score(
    estimator=base_model,
    X=x_train_scaled,
    y=y_train,
    scoring='accuracy',
    cv=5,
)

print("Cross Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

Cross Validation Scores: [0.47368421 0.31578947 0.52631579 0.36842105 0.77777778]
Mean CV Accuracy: 0.4923976608187135


In [8]:
# Exhaustive search over regularisation strength and penalty type for the
# liblinear logistic regression, scored by 5-fold cross-validated accuracy on
# the scaled training split.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
}

grid = GridSearchCV(
    estimator=LogisticRegression(class_weight='balanced', max_iter=1000),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid.fit(x_train_scaled, y_train)

# best_score_ is the mean cross-validated accuracy of the best parameter set.
print("\nBest Parameters:", grid.best_params_)
print("Best Accuracy:", grid.best_score_)


Best Parameters: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
Best Accuracy: 0.5345029239766081


In [9]:
# Score the estimator selected by the grid search on the held-out test split.
best_model = grid.best_estimator_
y_pred = best_model.predict(x_test_scaled)

print("\nTest Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred))


Test Accuracy: 0.71875
Confusion Matrix:
 [[ 7  7]
 [ 2 16]]


In [10]:
# Random forest (300 trees, depth capped at 5) trained on the unscaled
# training features and evaluated on the held-out split.
rf_model = RandomForestClassifier(
    random_state=42,
    max_depth=5,
    n_estimators=300,
).fit(x_train, y_train)

y_pred = rf_model.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("\nTest Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred))


Test Accuracy: 0.75
Confusion Matrix:
 [[10  4]
 [ 4 14]]


In [11]:
# Random forest variant with more, shallower trees (500 trees, depth capped at 3),
# trained on the unscaled training features and evaluated on the held-out split.
rf_model = RandomForestClassifier(
    random_state=42,
    max_depth=3,
    n_estimators=500,
).fit(x_train, y_train)

y_pred = rf_model.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("\nTest Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred))


Test Accuracy: 0.75
Confusion Matrix:
 [[ 7  7]
 [ 1 17]]
