In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Data Description

In [None]:
# Load the raw dataset and preview its first five rows.
raw_csv_path = "assign.csv"
df = pd.read_csv(raw_csv_path)
df.head(n=5)

In [None]:
# Preview the last rows of the raw frame.
df.tail()

In [None]:
# Column dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Summary statistics of the raw frame.
df.describe()

# Data visualization

In [None]:
# Plot the correlation matrix as an annotated heatmap.
# numeric_only=True restricts corr() to numeric/boolean columns; on
# pandas >= 2.0 corr() raises on string-valued columns (e.g. x2..x8)
# instead of silently skipping them.
plt.figure(figsize=(10,10))
sns.heatmap(df.corr(numeric_only=True), cmap='coolwarm', annot=True)

In [None]:
from scipy import stats

# Point-biserial correlation between the target y and feature x9.
target_values = df["y"]
x9_values = df["x9"]
stats.pointbiserialr(target_values, x9_values)

In [None]:
# Pairwise plots over the columns of df, colored by the target y.
sns.pairplot(
    df,
    hue='y',
    palette='coolwarm',
)

In [None]:
# Distribution of x9 within each x7 category, split by the target y.
sns.boxplot(
    data=df,
    x="x7",
    y="x9",
    hue="y",
    palette="coolwarm",
)

In [None]:
# Joint scatter of x1 against x9, colored by the target y.
sns.jointplot(
    data=df,
    x='x1',
    y='x9',
    hue='y',
    kind='scatter',
)

In [None]:
# Joint scatter of x10 against x17, colored by the target y.
sns.jointplot(
    data=df,
    x='x10',
    y='x17',
    hue='y',
    kind='scatter',
)

In [None]:
# Distribution of x13 for each x2 category, split by the target y.
plt.figure(figsize=(15, 10))
sns.violinplot(
    data=df,
    x='x2',
    y='x13',
    hue='y',
    palette='rainbow',
)

# Data Preprocessing
Process null values

In [None]:
# Inspect the value distribution of every categorical column in one pass
# instead of one copy-pasted cell per column.
# dropna=False keeps missing entries in the counts, so NaN shows up next to
# placeholder strings such as "unknown" when looking for null markers
# (value_counts() omits NaN by default).
for column in ['x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8']:
    print(df[column].value_counts(dropna=False))
    print()  # blank line between columns

From the value counts above, we can conclude that missing values in the categorical columns appear either as NaN or as the string "unknown". In addition, the numerical column x13 contains NaN values. We therefore need to tell pandas to treat these values as null.

In [None]:
# Re-read the CSV, treating the placeholder string "unknown" (and the
# literal "NaN") as missing values. The list was previously assigned twice.
na_values = ['NaN', "unknown"]
data = pd.read_csv("assign.csv", na_values=na_values)

In [None]:
# Number of missing values per column.
missing_per_column = data.isna().sum()
missing_per_column

In [None]:
# Bar chart of the number of missing values per column.
data.isna().sum().plot.bar(figsize=(10, 5))

In [None]:
# Fill missing x14 values with the column median.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that form is chained assignment, which pandas warns about
# and which leaves `data` unchanged under copy-on-write.
# NOTE(review): the notes above mention NaN values in x13, not x14 - confirm
# which column is intended.
median = data['x14'].median()
data['x14'] = data['x14'].fillna(median)

In [None]:
# Most frequent value(s) of the categorical columns that will be imputed.
columns_to_impute = ['x2', 'x3', 'x4', 'x5', 'x6', 'x8']
all_modes = data.mode()
all_modes.loc[:, columns_to_impute]

In [None]:
# Fill x2, x3, x4, x5, x6 and x8 with a fixed replacement value per column
# (presumably the most frequent values from the mode table - verify against
# its output).
# Each column is assigned back instead of using fillna(inplace=True) on a
# column selection: that is chained assignment, which pandas warns about and
# which leaves `data` unchanged under copy-on-write.
most_frequent = {
    'x2': "admin",
    'x3': "married",
    'x4': "university",
    'x5': "yes",
    'x6': "no",
    'x8': "cell",
}
for column, fill_value in most_frequent.items():
    data[column] = data[column].fillna(fill_value)

In [None]:
# Frequency of each distinct x11 value.
sns.countplot(
    data=data,
    x="x11",
)

In [None]:
# Share of (non-null) rows whose x11 value differs from 999.
# The notebook's observation: only about 3 percent of rows are not 999,
# i.e. about 97 percent are 999, so this column has little value for modeling.
# The share is taken over all non-null rows, so it is a true proportion
# rather than the ratio of non-999 rows to 999 rows.
x11_known = data['x11'].dropna()
x11_known.ne(999).mean()

In [None]:
# Remove x11 from the modeling frame.
# errors="ignore" keeps this cell re-runnable: dropping in place a second
# time would raise KeyError because the column is already gone.
data = data.drop(columns="x11", errors="ignore")

In [None]:
# Frequency of each distinct x12 value.
sns.countplot(
    data=data,
    x="x12",
)

In [None]:
# Share of (non-null) rows whose x12 value is not 0.
# The notebook's observation: roughly 15 percent of rows are non-zero and
# roughly 85 percent are 0. The column may still carry some signal, so
# dropping it is deferred to the fine-tuning section, where the model is
# compared with and without it.
# The share is taken over all non-null rows, so it is a true proportion
# rather than the ratio of non-zero rows to zero rows.
x12_known = data['x12'].dropna()
x12_known.ne(0).mean()

Convert categorical data to numerical data using one-hot encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

In [None]:
# Separate the target from the features and hold out 20% of rows for testing.
y = data['y']
X = data.drop(columns='y')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Partition the training features into categorical and numerical views.
categorical_columns = ['x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8']
X_train_cat = X_train[categorical_columns]
X_train_num = X_train.drop(columns=categorical_columns)

In [None]:
# Initialize preprocessing instances.
poly = PolynomialFeatures(degree=2)  # not used by the baseline transformer below
scalar = StandardScaler()
# handle_unknown="ignore": a category that appears only in the test split is
# encoded as all zeros instead of making transform() raise.
encode = OneHotEncoder(handle_unknown="ignore")

# Standardize the numerical columns and one-hot encode the categorical ones.
full_pipeline = ColumnTransformer([
        ("scaler", scalar, X_train_num.columns),
        ("encode", encode, X_train_cat.columns),
    ])
# Fit on the training split only, then apply the fitted transform to the test split.
X_train_prepared = full_pipeline.fit_transform(X_train)
X_test_preapred = full_pipeline.transform(X_test)

In [None]:
# With the categorical and numerical features handled, look at the class
# balance of the target: the data is imbalanced (false values far outnumber
# true values), so it will need resampling.
class_counts = y.value_counts()
class_counts.plot(kind="pie")

### Undersampling
This resampling technique is temporary; more resampling techniques will be examined in the fine-tuning section.

In [None]:
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Balance the classes with random under-sampling (sampling_strategy=1).
rus = RandomUnderSampler(sampling_strategy=1, random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train_prepared, y_train)

# Show the class proportions after resampling.
resampled_counts = y_train_resampled.value_counts()
ax = resampled_counts.plot.pie()
ax.set_title("undersampling")


# Model training and validation

#### Model 1 - SGDClassifier

In [None]:
from sklearn.linear_model import SGDClassifier

# Train the SGD classifier on the resampled training set.
sgd_params = {"max_iter": 500, "tol": 1e-3, "random_state": 42}
sgd_clf = SGDClassifier(**sgd_params)
sgd_clf.fit(X_train_resampled, y_train_resampled)

In [None]:
# Predict labels for the prepared test split.
predictions = sgd_clf.predict(X_test_preapred)

In [None]:
from sklearn.metrics import classification_report

# Classification report for the test-set predictions.
test_report = classification_report(y_test, predictions)
print(test_report)

In [None]:
# Class counts in the test split, for reading the report above in context.
y_test.value_counts()

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the test-set predictions.
test_confusion = confusion_matrix(y_test, predictions)
print(test_confusion)

#### Model 1 fine-tuning and testing

It seems like this model is not overfitting.

In [None]:
# Fine-tuning experiment: add degree-2 polynomial features of x10 and x13.

# Initialize preprocessing instances.
# include_bias=False: the classifier fits its own intercept, so the constant
# column that PolynomialFeatures would add is redundant.
poly = PolynomialFeatures(degree=2, include_bias=False)
scalar = StandardScaler()
# handle_unknown="ignore": a category that appears only in the test split is
# encoded as all zeros instead of making transform() raise.
encode = OneHotEncoder(handle_unknown="ignore")
# Standardize the polynomial terms as well: SGD is sensitive to feature scale,
# and squared / interaction terms would otherwise reach the model unscaled
# next to the standardized columns.
poly_scaled = Pipeline([
        ("poly", poly),
        ("scaler", StandardScaler()),
    ])
full_pipeline = ColumnTransformer([
        # polynomial transformation
        ("poly", poly_scaled, ['x10', 'x13']),
        ("scaler", scalar, X_train_num.columns),
        ("encode", encode, X_train_cat.columns),
    ])
# Fit on the training split only, then apply the fitted transform to the test split.
X_train_prepared = full_pipeline.fit_transform(X_train)
X_test_preapred = full_pipeline.transform(X_test)

# Now we need to resample the data to balance the data
rus = RandomUnderSampler(random_state=42, sampling_strategy=1)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train_prepared, y_train)

sgd_clf = SGDClassifier(max_iter=500, tol=1e-3, random_state=42)
sgd_clf.fit(X_train_resampled, y_train_resampled)
# Evaluated on the resampled training data itself (presumably to check for
# overfitting against the earlier test-set scores).
predictions = sgd_clf.predict(X_train_resampled)
print(classification_report(y_train_resampled, predictions))
print(confusion_matrix(y_train_resampled, predictions))

In [None]:
# Alternative resampling experiment: Edited Nearest Neighbours under-sampling.
# The notebook's observation was that it seems to make no difference.
from imblearn.under_sampling import EditedNearestNeighbours

enn = EditedNearestNeighbours()
X_train_resampled, y_train_resampled = enn.fit_resample(X_train_prepared, y_train)

# Show the class proportions after ENN resampling.
enn_counts = y_train_resampled.value_counts()
ax = enn_counts.plot.pie()
ax.set_title("undersampling")

In [None]:
# Fine-tuning experiment: drop x12 and keep the polynomial features of
# x10 and x13, then evaluate on the test split.

# Initialize preprocessing instances.
# include_bias=False: the classifier fits its own intercept, so the constant
# column that PolynomialFeatures would add is redundant.
poly = PolynomialFeatures(degree=2, include_bias=False)
scalar = StandardScaler()
# handle_unknown="ignore": a category that appears only in the test split is
# encoded as all zeros instead of making transform() raise.
encode = OneHotEncoder(handle_unknown="ignore")
# Fine tune: exclude x12 from the numerical columns. errors="ignore" keeps the
# cell re-runnable; a plain drop raises KeyError the second time because
# X_train_num is overwritten. Columns not listed in the ColumnTransformer are
# dropped by its default remainder setting, so x12 never reaches the model.
X_train_num = X_train_num.drop(columns="x12", errors="ignore")
# Standardize the polynomial terms as well: SGD is sensitive to feature scale,
# and squared / interaction terms would otherwise reach the model unscaled
# next to the standardized columns.
poly_scaled = Pipeline([
        ("poly", poly),
        ("scaler", StandardScaler()),
    ])
full_pipeline = ColumnTransformer([
        # polynomial transformation
        ("poly", poly_scaled, ['x10', 'x13']), # Fine tune
        ("scaler", scalar, X_train_num.columns),
        ("encode", encode, X_train_cat.columns),
    ])
# Fit on the training split only, then apply the fitted transform to the test split.
X_train_prepared = full_pipeline.fit_transform(X_train)
X_test_preapred = full_pipeline.transform(X_test)

# Now we need to resample the data to balance the data
rus = RandomUnderSampler(random_state=42, sampling_strategy=1)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train_prepared, y_train)

sgd_clf = SGDClassifier(max_iter=500, tol=1e-3, random_state=42)
sgd_clf.fit(X_train_resampled, y_train_resampled)
# Evaluate on the held-out test split.
predictions = sgd_clf.predict(X_test_preapred)
print(classification_report(y_test, predictions))
print(confusion_matrix(y_test, predictions))