In [426]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors

from sklearn.utils import resample
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import scale
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.decomposition import PCA

In [427]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [428]:
train_data.head()

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
1,462643,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4,A
2,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B
3,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B
4,462669,Female,Yes,40,Yes,Entertainment,,High,6.0,Cat_6,A


In [429]:
test_data.head()

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,458989,Female,Yes,36,Yes,Engineer,0.0,Low,1.0,Cat_6,B
1,458994,Male,Yes,37,Yes,Healthcare,8.0,Average,4.0,Cat_6,A
2,458996,Female,Yes,69,No,,0.0,Low,1.0,Cat_6,A
3,459000,Male,Yes,59,No,Executive,11.0,High,2.0,Cat_6,B
4,459001,Female,No,19,No,Marketing,,Low,4.0,Cat_6,A


In [430]:
print(train_data.shape)
print(test_data.shape)

(8068, 11)
(2627, 11)


In [431]:
df = pd.concat([train_data, test_data]).drop_duplicates(subset='ID').reset_index(drop=True)

In [432]:
df.head()

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
1,462643,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4,A
2,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B
3,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B
4,462669,Female,Yes,40,Yes,Entertainment,,High,6.0,Cat_6,A


In [433]:
df.drop(['ID'], axis=1, inplace=True)
df.columns

Index(['Gender', 'Ever_Married', 'Age', 'Graduated', 'Profession',
       'Work_Experience', 'Spending_Score', 'Family_Size', 'Var_1',
       'Segmentation'],
      dtype='object')

In [434]:
columns = ['Gender', 'Ever_Married', 'Age', 'Graduated', 'Profession', 'Work_Experience', 'Spending_Score', 'Family_Size', 'Var_1', 'Segmentation']
ids = df[columns]
# df[ids.isin(ids[ids.duplicated()])].sort_values('E')

In [435]:
df.drop_duplicates(inplace=True)

In [436]:
df.duplicated(subset=columns).value_counts()

False    7926
dtype: int64

In [437]:
df.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
8358    False
8359    False
8360    False
8361    False
8362    False
Length: 7926, dtype: bool

## 1. Data Understanding
- Dataframe shape
- head and tail
- dtypes
- describe

In [438]:
df.shape

(7926, 10)

In [439]:
df.dtypes

Gender              object
Ever_Married        object
Age                  int64
Graduated           object
Profession          object
Work_Experience    float64
Spending_Score      object
Family_Size        float64
Var_1               object
Segmentation        object
dtype: object

## 2. Data Preprocessing
- Cek null data

In [440]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7926 entries, 0 to 8362
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Gender           7926 non-null   object 
 1   Ever_Married     7782 non-null   object 
 2   Age              7926 non-null   int64  
 3   Graduated        7846 non-null   object 
 4   Profession       7798 non-null   object 
 5   Work_Experience  7094 non-null   float64
 6   Spending_Score   7926 non-null   object 
 7   Family_Size      7585 non-null   float64
 8   Var_1            7846 non-null   object 
 9   Segmentation     7926 non-null   object 
dtypes: float64(2), int64(1), object(7)
memory usage: 681.1+ KB


In [441]:
df['Ever_Married'].fillna('No', inplace=True)
df['Graduated'].fillna('No', inplace=True)
df['Profession'].fillna(df['Profession'].mode()[0], inplace=True)
df['Work_Experience'].fillna(0, inplace=True)
df['Family_Size'].fillna(df['Family_Size'].mode()[0], inplace=True)
df['Var_1'].fillna(df['Var_1'].mode()[0], inplace=True)

In [442]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7926 entries, 0 to 8362
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Gender           7926 non-null   object 
 1   Ever_Married     7926 non-null   object 
 2   Age              7926 non-null   int64  
 3   Graduated        7926 non-null   object 
 4   Profession       7926 non-null   object 
 5   Work_Experience  7926 non-null   float64
 6   Spending_Score   7926 non-null   object 
 7   Family_Size      7926 non-null   float64
 8   Var_1            7926 non-null   object 
 9   Segmentation     7926 non-null   object 
dtypes: float64(2), int64(1), object(7)
memory usage: 681.1+ KB


In [443]:
df.head()

Unnamed: 0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
1,Female,Yes,38,Yes,Engineer,0.0,Average,3.0,Cat_4,A
2,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B
3,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B
4,Female,Yes,40,Yes,Entertainment,0.0,High,6.0,Cat_6,A


Converting object to nums

In [444]:
# Gender
df.replace({ 
    'Gender': { 'Male': 1, 'Female': 0 },
    'Ever_Married': {'Yes': 1, 'No': 0},
    'Graduated': {'Yes': 1, 'No': 0},
    'Spending_Score': {'Low': 1, 'Average': 2, 'High': 3},
    'Var_1': {
        'Cat_4': 4, 
        'Cat_6': 6, 
        'Cat_7': 7, 
        'Cat_3': 3, 
        'Cat_1': 1, 
        'Cat_2': 2, 
        'Cat_5': 5
    }
}, inplace=True)

In [445]:
df = pd.get_dummies(df, columns=['Profession'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7926 entries, 0 to 8362
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Gender                    7926 non-null   int64  
 1   Ever_Married              7926 non-null   int64  
 2   Age                       7926 non-null   int64  
 3   Graduated                 7926 non-null   int64  
 4   Work_Experience           7926 non-null   float64
 5   Spending_Score            7926 non-null   int64  
 6   Family_Size               7926 non-null   float64
 7   Var_1                     7926 non-null   int64  
 8   Segmentation              7926 non-null   object 
 9   Profession_Artist         7926 non-null   uint8  
 10  Profession_Doctor         7926 non-null   uint8  
 11  Profession_Engineer       7926 non-null   uint8  
 12  Profession_Entertainment  7926 non-null   uint8  
 13  Profession_Executive      7926 non-null   uint8  
 14  Professi

In [446]:
df.head()

Unnamed: 0,Gender,Ever_Married,Age,Graduated,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation,Profession_Artist,Profession_Doctor,Profession_Engineer,Profession_Entertainment,Profession_Executive,Profession_Healthcare,Profession_Homemaker,Profession_Lawyer,Profession_Marketing
0,1,0,22,0,1.0,1,4.0,4,D,0,0,0,0,0,1,0,0,0
1,0,1,38,1,0.0,2,3.0,4,A,0,0,1,0,0,0,0,0,0
2,0,1,67,1,1.0,1,1.0,6,B,0,0,1,0,0,0,0,0,0
3,1,1,67,1,0.0,3,2.0,6,B,0,0,0,0,0,0,0,1,0
4,0,1,40,1,0.0,3,6.0,6,A,0,0,0,1,0,0,0,0,0


In [447]:
df.describe()

Unnamed: 0,Gender,Ever_Married,Age,Graduated,Work_Experience,Spending_Score,Family_Size,Var_1,Profession_Artist,Profession_Doctor,Profession_Engineer,Profession_Entertainment,Profession_Executive,Profession_Healthcare,Profession_Homemaker,Profession_Lawyer,Profession_Marketing
count,7926.0,7926.0,7926.0,7926.0,7926.0,7926.0,7926.0,7926.0,7926.0,7926.0,7926.0,7926.0,7926.0,7926.0,7926.0,7926.0,7926.0
mean,0.544411,0.578854,43.501893,0.617588,2.436159,1.543906,2.816679,5.113172,0.32488,0.0892,0.089452,0.122634,0.073934,0.153671,0.031668,0.0757,0.038859
std,0.498055,0.493774,16.526745,0.486007,3.35709,0.740724,1.527483,1.436324,0.46836,0.28505,0.285414,0.328038,0.26168,0.360656,0.175125,0.264535,0.193272
min,0.0,0.0,18.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,31.0,0.0,0.0,1.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,1.0,40.0,1.0,1.0,1.0,2.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,53.0,1.0,4.0,2.0,4.0,6.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,89.0,1.0,14.0,3.0,9.0,7.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [448]:
df.corr(numeric_only=True)

Unnamed: 0,Gender,Ever_Married,Age,Graduated,Work_Experience,Spending_Score,Family_Size,Var_1,Profession_Artist,Profession_Doctor,Profession_Engineer,Profession_Entertainment,Profession_Executive,Profession_Healthcare,Profession_Homemaker,Profession_Lawyer,Profession_Marketing
Gender,1.0,0.122754,0.039464,-0.03487,-0.056603,0.067396,0.054242,0.03028,-0.040493,0.011645,-0.214803,0.138117,0.227497,0.021009,-0.135478,-0.018816,-0.034972
Ever_Married,0.122754,1.0,0.543905,0.17431,-0.092343,0.584964,-0.05871,0.094255,0.155748,-0.07822,0.01217,0.023646,0.191203,-0.393993,-0.019397,0.196768,-0.095579
Age,0.039464,0.543905,1.0,0.214537,-0.181745,0.403443,-0.260714,0.172366,0.100902,-0.113671,-0.035441,-0.015847,0.126071,-0.415954,-0.05951,0.543271,-0.081797
Graduated,-0.03487,0.17431,0.214537,1.0,0.040835,0.082924,-0.210027,0.127803,0.328011,-0.02517,-0.109042,0.005307,-0.075312,-0.204608,-0.017812,-0.007413,-0.095669
Work_Experience,-0.056603,-0.092343,-0.181745,0.040835,1.0,-0.076333,-0.056037,0.037886,0.017165,-0.003081,-0.010698,0.012037,-0.030105,0.00727,0.166879,-0.114621,-0.008817
Spending_Score,0.067396,0.584964,0.403443,0.082924,-0.076333,1.0,0.108769,0.079596,0.028531,-0.080404,-0.021266,-0.056436,0.341946,-0.249619,-0.019961,0.1949,-0.069211
Family_Size,0.054242,-0.05871,-0.260714,-0.210027,-0.056037,0.108769,1.0,-0.135305,-0.135626,0.005972,0.029515,-0.014307,0.11094,0.216747,-0.072165,-0.159264,0.029262
Var_1,0.03028,0.094255,0.172366,0.127803,0.037886,0.079596,-0.135305,1.0,0.08583,-0.011715,-0.057941,-0.031602,0.034808,-0.086923,-0.034817,0.092023,-0.030844
Profession_Artist,-0.040493,0.155748,0.100902,0.328011,0.017165,0.028531,-0.135626,0.08583,1.0,-0.217091,-0.217428,-0.25935,-0.196007,-0.295596,-0.125449,-0.198524,-0.139485
Profession_Doctor,0.011645,-0.07822,-0.113671,-0.02517,-0.003081,-0.080404,0.005972,-0.011715,-0.217091,1.0,-0.098088,-0.117,-0.088424,-0.133352,-0.056594,-0.08956,-0.062925


Split the data

In [449]:
X = df.drop('Segmentation', axis=1).copy()
y = df['Segmentation']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = scale(X_train)
X_test = scale(X_test)

## 3. Time to make Model

In [450]:
svm = SVC()
svm.fit(X_train, y_train)
predicts = svm.predict(X_test)

In [451]:
print(classification_report(y_test, predicts))

              precision    recall  f1-score   support

           A       0.40      0.43      0.42       390
           B       0.39      0.34      0.37       388
           C       0.55      0.48      0.51       379
           D       0.57      0.67      0.62       429

    accuracy                           0.49      1586
   macro avg       0.48      0.48      0.48      1586
weighted avg       0.48      0.49      0.48      1586



In [452]:
from sklearn.neighbors import KNeighborsClassifier
import math

In [453]:
n_neighbors = round(math.sqrt(len(X_train)))
if (n_neighbors % 2 == 0):
    n_neighbors -= 1

In [454]:
model_knn = KNeighborsClassifier(111)
model_knn.fit(X_train, y_train)

predicts = model_knn.predict(X_test)
print(classification_report(y_test, predicts))


              precision    recall  f1-score   support

           A       0.39      0.42      0.40       390
           B       0.37      0.26      0.30       388
           C       0.49      0.53      0.51       379
           D       0.58      0.65      0.61       429

    accuracy                           0.47      1586
   macro avg       0.45      0.46      0.46      1586
weighted avg       0.46      0.47      0.46      1586

