# EDA

In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

In [3]:
df = pd.read_csv("./ACME-HappinessSurvey2020.csv")
df.head()

Unnamed: 0,Y,X1,X2,X3,X4,X5,X6
0,0,3,3,3,4,2,4
1,0,3,2,3,5,4,3
2,1,5,3,3,3,3,5
3,0,5,4,3,3,3,5
4,0,5,4,3,3,3,5


In [12]:
df.describe()

Unnamed: 0,Y,X1,X2,X3,X4,X5,X6
count,126.0,126.0,126.0,126.0,126.0,126.0,126.0
mean,0.547619,4.333333,2.531746,3.309524,3.746032,3.650794,4.253968
std,0.499714,0.8,1.114892,1.02344,0.875776,1.147641,0.809311
min,0.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,0.0,4.0,2.0,3.0,3.0,3.0,4.0
50%,1.0,5.0,3.0,3.0,4.0,4.0,4.0
75%,1.0,5.0,3.0,4.0,4.0,4.0,5.0
max,1.0,5.0,5.0,5.0,5.0,5.0,5.0


Checked that there is no missing or malformed data

Summarising the data where the review was positive and negative

In [4]:
X, y = df.drop(["Y"], axis=1), df["Y"]
X_pos = X.loc[y == 1]
X_neg = X.loc[y == 0]


In [5]:
X_pos.describe()

Unnamed: 0,X1,X2,X3,X4,X5,X6
count,69.0,69.0,69.0,69.0,69.0,69.0
mean,4.536232,2.507246,3.449275,3.797101,3.884058,4.376812
std,0.698311,1.106441,1.022342,0.900649,1.064621,0.768908
min,3.0,1.0,1.0,1.0,1.0,1.0
25%,4.0,2.0,3.0,3.0,3.0,4.0
50%,5.0,2.0,3.0,4.0,4.0,5.0
75%,5.0,3.0,4.0,4.0,5.0,5.0
max,5.0,5.0,5.0,5.0,5.0,5.0


In [6]:
X_neg.describe()

Unnamed: 0,X1,X2,X3,X4,X5,X6
count,57.0,57.0,57.0,57.0,57.0,57.0
mean,4.087719,2.561404,3.140351,3.684211,3.368421,4.105263
std,0.851064,1.13417,1.007802,0.848484,1.189712,0.838455
min,1.0,1.0,1.0,2.0,1.0,2.0
25%,4.0,2.0,3.0,3.0,3.0,3.0
50%,4.0,3.0,3.0,4.0,4.0,4.0
75%,5.0,3.0,4.0,4.0,4.0,5.0
max,5.0,5.0,5.0,5.0,5.0,5.0


Would have expected the mean vals for all of the variables to be higher for the positive reviews given the questions but `X2` is actually lower.

In [7]:
X_pos.mean() - X_neg.mean()

X1    0.448513
X2   -0.054157
X3    0.308924
X4    0.112891
X5    0.515637
X6    0.271548
dtype: float64

Splitting the data into train test split and using K Fold cross validation to examine the performance of some preliminary classification models before any hyperparameter or feature selection / tuning

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y)
kf = KFold(n_splits=5)
models = [LogisticRegression(), RandomForestClassifier()]
for model in models:
    res = cross_val_score(model, X_train, y_train, scoring="accuracy", cv = kf)
    print(res.mean())

0.5526315789473684
0.5842105263157895


## Things to try
- Feature selection - select a subset of features based on information gain or other predective capability or importance metric
- XGBoost or a gradient boosted tree approach
- Hyperparameter tuning using grid search, random search or bayesian optimisation methods
- Support vector classifier, naive bayes, other classification models