In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from kneed import KneeLocator
from numpy.linalg import norm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score
import sqlite3 as sql

In [2]:
df = pd.read_csv('telco.csv')

In [3]:
num_cols = []
categorical_cols = []

In [4]:
def sort_cols(data):


    for i in data.columns:
        if data[i].dtypes == "object":
            categorical_cols.append(i)
        else :
            num_cols.append(i)

In [5]:
sort_cols(df)

In [6]:
def fix_cat_cols(data,cat_columns):
    for i in cat_columns:
        data[i] = data[i].fillna(data[i].mode()[0])

In [7]:
fix_cat_cols(df,categorical_cols)

In [8]:
def fix_num_cols(data,num_columns):
    for i in num_columns:
        data[i] = data[i].fillna(data[i].mean())

In [9]:
fix_num_cols(df,num_cols)

In [10]:
df['Total_volume (Bytes)'] = df['Total DL (Bytes)'] + df['Total UL (Bytes)']

In [11]:
df['Session frequency'] = df.groupby(['Bearer Id'])['Dur. (ms)'].transform('count')

In [12]:
X = df[['Session frequency','Dur. (ms)','Total_volume (Bytes)']]

In [13]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
col_names = ["Session frequency","Dur. (ms)",'Total_volume (Bytes)']
X[col_names] = scaler.fit_transform(X[col_names])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


In [14]:
kmeans = KMeans(
    init= "random",
    n_clusters = 3,
    n_init= 10,
    max_iter= 300,
    random_state= 42
)

In [15]:
kmeans.fit(X)

KMeans(init='random', n_clusters=3, random_state=42)

In [16]:
X['Engagement_labels'] = kmeans.labels_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Engagement_labels'] = kmeans.labels_


In [17]:
X

Unnamed: 0,Session frequency,Dur. (ms),Total_volume (Bytes),Engagement_labels
0,-0.084545,21.213047,-0.614351,1
1,-0.084545,15.554551,0.865130,0
2,-0.084545,15.513311,-0.769595,1
3,-0.084545,15.016588,1.610557,0
4,-0.084545,12.147531,0.457962,0
...,...,...,...,...
149996,-0.084545,-0.288492,0.556671,0
149997,-0.084545,-0.081920,0.859396,0
149998,-0.072074,-0.078477,0.539872,0
149999,-0.072074,-0.082660,-0.385639,1


In [18]:
X.describe()

Unnamed: 0,Session frequency,Dur. (ms),Total_volume (Bytes),Engagement_labels
count,150001.0,150001.0,150001.0,150001.0
mean,2.73794e-17,1.818977e-17,-1.293653e-16,0.507617
std,1.000003,1.000003,1.000003,0.512988
min,-0.08454529,-1.20274,-1.910174,0.0
25%,-0.08454529,-0.5820368,-0.8645881,0.0
50%,-0.08454529,-0.2247065,0.004486279,1.0
75%,-0.08454529,0.3433174,0.8623481,1.0
max,12.26188,21.65339,1.883214,2.0


In [19]:
X1 = X[X.Engagement_labels==1]
X0 = X[X.Engagement_labels==0]
X2 = X[X.Engagement_labels==2]

In [20]:
needed_cols = ['Avg RTT DL (ms)','Avg RTT UL (ms)','Avg Bearer TP DL (kbps)','Avg Bearer TP UL (kbps)','TCP DL Retrans. Vol (Bytes)','TCP UL Retrans. Vol (Bytes)']

In [21]:
for _ in needed_cols:
    Q1 = df[_].quantile(0.25)
    Q3 = df[_].quantile(0.75)
    IQR = Q1 - Q3
    df[_] = np.where(df[_] < Q1 - 1.5 * IQR, df[_].mean(), df[_] )
    df[_] = np.where(df[_] > Q3 - 1.5 * IQR, df[_].mean(), df[_] )
    print(df[_].skew())

3.8393826668361446
2.9992848178475784
2.6404715453138103
1.8691868667041291
9.258979055345582
13.021441390853878


In [22]:
df['AVG RTT (ms)'] = df['Avg RTT DL (ms)'] + df['Avg RTT UL (ms)']
df['Avg Bearer TP (kbps)'] = df['Avg Bearer TP DL (kbps)'] + df['Avg Bearer TP UL (kbps)']
df['TCP Retrans. Vol (Bytes)'] = df['TCP DL Retrans. Vol (Bytes)'] + df['TCP UL Retrans. Vol (Bytes)']

In [23]:
df

Unnamed: 0,Bearer Id,Start,Start ms,End,End ms,Dur. (ms),IMSI,MSISDN/Number,IMEI,Last Location Name,...,Gaming UL (Bytes),Other DL (Bytes),Other UL (Bytes),Total UL (Bytes),Total DL (Bytes),Total_volume (Bytes),Session frequency,AVG RTT (ms),Avg Bearer TP (kbps),TCP Retrans. Vol (Bytes)
0,1.311448e+19,4/4/2019 12:01,770.0000,4/25/2019 14:35,662.00000,1.823652e+06,2.082014e+14,3.366496e+10,3.552121e+13,9.16456699548519E+015,...,1.434415e+07,1.717444e+08,8.814393e+06,3.674974e+07,3.088796e+08,3.456294e+08,1,127.458589,15070.474573,2.156957e+07
1,1.311448e+19,4/9/2019 13:04,235.0000,4/25/2019 8:15,606.00000,1.365104e+06,2.082019e+14,3.368185e+10,3.579401e+13,L77566A,...,1.170709e+06,5.269042e+08,1.505514e+07,5.380039e+07,6.533850e+08,7.071854e+08,1,127.458589,15070.474573,2.156957e+07
2,1.311448e+19,4/9/2019 17:42,1.0000,4/25/2019 11:58,652.00000,1.361762e+06,2.082003e+14,3.376063e+10,3.528151e+13,D42335A,...,3.956300e+05,4.106926e+08,4.215763e+06,2.788364e+07,2.798073e+08,3.076910e+08,1,127.458589,15070.474573,2.156957e+07
3,1.311448e+19,4/10/2019 0:31,486.0000,4/25/2019 7:36,171.00000,1.321509e+06,2.082014e+14,3.375034e+10,3.535661e+13,T21824A,...,1.084972e+07,7.490399e+08,1.279728e+07,4.332422e+07,8.460285e+08,8.893527e+08,1,127.458589,15070.474573,2.156957e+07
4,1.311448e+19,4/12/2019 20:10,565.0000,4/25/2019 10:40,954.00000,1.089009e+06,2.082014e+14,3.369980e+10,3.540701e+13,D88865A,...,3.529801e+06,5.507095e+08,1.391032e+07,3.854281e+07,5.691386e+08,6.076814e+08,1,127.458589,15070.474573,2.156957e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149996,7.277826e+18,4/29/2019 7:28,451.0000,4/30/2019 6:02,214.00000,8.123000e+04,2.082022e+14,3.365069e+10,3.548311e+13,D20434A,...,9.197207e+06,3.264510e+06,1.348742e+07,5.762885e+07,5.741753e+08,6.318041e+08,1,127.458589,15070.474573,2.156957e+07
149997,7.349883e+18,4/29/2019 7:28,483.0000,4/30/2019 10:41,187.00000,9.797000e+04,2.082019e+14,3.366345e+10,3.566051e+13,D10223C,...,4.735033e+06,7.121804e+08,2.457758e+06,3.913508e+07,6.666488e+08,7.057839e+08,1,127.458589,15070.474573,2.156957e+07
149998,1.311448e+19,4/29/2019 7:28,283.0000,4/30/2019 10:46,810.00000,9.824900e+04,2.082017e+14,3.362189e+10,3.572121e+13,T51102A,...,1.339432e+07,1.211009e+08,1.131473e+07,3.491222e+07,5.927864e+08,6.276986e+08,2,127.458589,15070.474573,2.156957e+07
149999,1.311448e+19,4/29/2019 7:28,696.0000,4/30/2019 10:40,327.00000,9.791000e+04,2.082021e+14,3.361962e+10,8.618620e+13,L88342B,...,2.529475e+06,8.147131e+08,1.406930e+06,2.962610e+07,3.718959e+08,4.015220e+08,2,127.458589,15070.474573,2.156957e+07


In [24]:
scaler = StandardScaler()
cols_scaled = ['AVG RTT (ms)','Avg Bearer TP (kbps)','TCP Retrans. Vol (Bytes)']

In [25]:
df[cols_scaled] = scaler.fit_transform(df[cols_scaled])


In [26]:
Y = df[['AVG RTT (ms)','Avg Bearer TP (kbps)','TCP Retrans. Vol (Bytes)']]

In [27]:
Y

Unnamed: 0,AVG RTT (ms),Avg Bearer TP (kbps),TCP Retrans. Vol (Bytes)
0,-0.348097,-0.417691,-0.14053
1,-0.348097,-0.417691,-0.14053
2,-0.348097,-0.417691,-0.14053
3,-0.348097,-0.417691,-0.14053
4,-0.348097,-0.417691,-0.14053
...,...,...,...
149996,-0.348097,-0.417691,-0.14053
149997,-0.348097,-0.417691,-0.14053
149998,-0.348097,-0.417691,-0.14053
149999,-0.348097,-0.417691,-0.14053


In [28]:
kmeans.fit(Y)

KMeans(init='random', n_clusters=3, random_state=42)

In [29]:
Y['Experience_labels'] = kmeans.labels_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y['Experience_labels'] = kmeans.labels_


In [30]:
Y['Experience_labels'].value_counts()

1    123247
0     15296
2     11458
Name: Experience_labels, dtype: int64

In [31]:
Y1 = Y[Y.Experience_labels==1]
Y0 = Y[Y.Experience_labels==0]
Y2 = Y[Y.Experience_labels==2]

In [32]:
X['Engagement_score'] =np.sqrt(((X['Session frequency'] + X['Dur. (ms)'] + X['Total_volume (Bytes)']).pow(2)) - ((2**2)))
X['Engagement_score'] = X['Engagement_score'].fillna(0)                            

  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Engagement_score'] =np.sqrt(((X['Session frequency'] + X['Dur. (ms)'] + X['Total_volume (Bytes)']).pow(2)) - ((2**2)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Engagement_score'] = X['Engagement_score'].fillna(0)


In [33]:
X['Engagement_score']

0         20.416424
1         16.212239
2         14.522097
3         16.421254
4         12.360183
            ...    
149996     0.000000
149997     0.000000
149998     0.000000
149999     0.000000
150000    12.097678
Name: Engagement_score, Length: 150001, dtype: float64

In [34]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150001 entries, 0 to 150000
Data columns (total 5 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Session frequency     150001 non-null  float64
 1   Dur. (ms)             150001 non-null  float64
 2   Total_volume (Bytes)  150001 non-null  float64
 3   Engagement_labels     150001 non-null  int32  
 4   Engagement_score      150001 non-null  float64
dtypes: float64(4), int32(1)
memory usage: 5.1 MB


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150001 entries, 0 to 150000
Data columns (total 60 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   Bearer Id                                 150001 non-null  float64
 1   Start                                     150001 non-null  object 
 2   Start ms                                  150001 non-null  float64
 3   End                                       150001 non-null  object 
 4   End ms                                    150001 non-null  float64
 5   Dur. (ms)                                 150001 non-null  float64
 6   IMSI                                      150001 non-null  float64
 7   MSISDN/Number                             150001 non-null  float64
 8   IMEI                                      150001 non-null  float64
 9   Last Location Name                        150001 non-null  object 
 10  Avg RTT DL (ms)     

In [36]:
Y

Unnamed: 0,AVG RTT (ms),Avg Bearer TP (kbps),TCP Retrans. Vol (Bytes),Experience_labels
0,-0.348097,-0.417691,-0.14053,1
1,-0.348097,-0.417691,-0.14053,1
2,-0.348097,-0.417691,-0.14053,1
3,-0.348097,-0.417691,-0.14053,1
4,-0.348097,-0.417691,-0.14053,1
...,...,...,...,...
149996,-0.348097,-0.417691,-0.14053,1
149997,-0.348097,-0.417691,-0.14053,1
149998,-0.348097,-0.417691,-0.14053,1
149999,-0.348097,-0.417691,-0.14053,1


In [37]:
Y['Experience_score'] =np.sqrt(((Y['AVG RTT (ms)'] + Y['Avg Bearer TP (kbps)'] + Y['TCP Retrans. Vol (Bytes)']).pow(2)) - ((2**2)))
Y['Experience_score'] = Y['Experience_score'].fillna(0)

  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y['Experience_score'] =np.sqrt(((Y['AVG RTT (ms)'] + Y['Avg Bearer TP (kbps)'] + Y['TCP Retrans. Vol (Bytes)']).pow(2)) - ((2**2)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y['Experience_score'] = Y['Experience_score'].fillna(0)


In [38]:
Y

Unnamed: 0,AVG RTT (ms),Avg Bearer TP (kbps),TCP Retrans. Vol (Bytes),Experience_labels,Experience_score
0,-0.348097,-0.417691,-0.14053,1,0.0
1,-0.348097,-0.417691,-0.14053,1,0.0
2,-0.348097,-0.417691,-0.14053,1,0.0
3,-0.348097,-0.417691,-0.14053,1,0.0
4,-0.348097,-0.417691,-0.14053,1,0.0
...,...,...,...,...,...
149996,-0.348097,-0.417691,-0.14053,1,0.0
149997,-0.348097,-0.417691,-0.14053,1,0.0
149998,-0.348097,-0.417691,-0.14053,1,0.0
149999,-0.348097,-0.417691,-0.14053,1,0.0


In [39]:
Y['Experience_score'].value_counts()

0.000000     131753
1.196708        452
0.082222        303
3.306900        204
3.858948        141
              ...  
0.186064          1
11.622474         1
6.098943          1
5.920102          1
2.563165          1
Name: Experience_score, Length: 13108, dtype: int64

In [40]:
new_df= pd.DataFrame()

In [41]:
new_df['MSISDN/Number'] = df['MSISDN/Number']

In [42]:
new_df = new_df.join(X)
new_df = new_df.join(Y)

In [43]:
new_df['Experience_score'].value_counts()

0.000000     131753
1.196708        452
0.082222        303
3.306900        204
3.858948        141
              ...  
0.186064          1
11.622474         1
6.098943          1
5.920102          1
2.563165          1
Name: Experience_score, Length: 13108, dtype: int64

In [44]:
new_df['Satisfaction_score'] = ((new_df['Engagement_score'] + new_df['Experience_score'])/2)+1
new_df['Satisfaction_score'] = new_df['Satisfaction_score'].fillna(0)

In [45]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150001 entries, 0 to 150000
Data columns (total 12 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   MSISDN/Number             150001 non-null  float64
 1   Session frequency         150001 non-null  float64
 2   Dur. (ms)                 150001 non-null  float64
 3   Total_volume (Bytes)      150001 non-null  float64
 4   Engagement_labels         150001 non-null  int32  
 5   Engagement_score          150001 non-null  float64
 6   AVG RTT (ms)              150001 non-null  float64
 7   Avg Bearer TP (kbps)      150001 non-null  float64
 8   TCP Retrans. Vol (Bytes)  150001 non-null  float64
 9   Experience_labels         150001 non-null  int32  
 10  Experience_score          150001 non-null  float64
 11  Satisfaction_score        150001 non-null  float64
dtypes: float64(10), int32(2)
memory usage: 12.6 MB


In [46]:
new_df

Unnamed: 0,MSISDN/Number,Session frequency,Dur. (ms),Total_volume (Bytes),Engagement_labels,Engagement_score,AVG RTT (ms),Avg Bearer TP (kbps),TCP Retrans. Vol (Bytes),Experience_labels,Experience_score,Satisfaction_score
0,3.366496e+10,-0.084545,21.213047,-0.614351,1,20.416424,-0.348097,-0.417691,-0.14053,1,0.0,11.208212
1,3.368185e+10,-0.084545,15.554551,0.865130,0,16.212239,-0.348097,-0.417691,-0.14053,1,0.0,9.106119
2,3.376063e+10,-0.084545,15.513311,-0.769595,1,14.522097,-0.348097,-0.417691,-0.14053,1,0.0,8.261048
3,3.375034e+10,-0.084545,15.016588,1.610557,0,16.421254,-0.348097,-0.417691,-0.14053,1,0.0,9.210627
4,3.369980e+10,-0.084545,12.147531,0.457962,0,12.360183,-0.348097,-0.417691,-0.14053,1,0.0,7.180091
...,...,...,...,...,...,...,...,...,...,...,...,...
149996,3.365069e+10,-0.084545,-0.288492,0.556671,0,0.000000,-0.348097,-0.417691,-0.14053,1,0.0,1.000000
149997,3.366345e+10,-0.084545,-0.081920,0.859396,0,0.000000,-0.348097,-0.417691,-0.14053,1,0.0,1.000000
149998,3.362189e+10,-0.072074,-0.078477,0.539872,0,0.000000,-0.348097,-0.417691,-0.14053,1,0.0,1.000000
149999,3.361962e+10,-0.072074,-0.082660,-0.385639,1,0.000000,-0.348097,-0.417691,-0.14053,1,0.0,1.000000


In [47]:
new_df['Satisfaction_score'].sort_values(ascending=False)[:10]

463       14.106199
50001     13.843121
135480    13.406864
70122     12.827887
125001    12.549484
9704      12.375299
43348     12.360667
91011     12.067298
6395      11.783479
59443     11.766921
Name: Satisfaction_score, dtype: float64

In [48]:
new_df.sort_values(ascending=False, by='Satisfaction_score')[:10]

Unnamed: 0,MSISDN/Number,Session frequency,Dur. (ms),Total_volume (Bytes),Engagement_labels,Engagement_score,AVG RTT (ms),Avg Bearer TP (kbps),TCP Retrans. Vol (Bytes),Experience_labels,Experience_score,Satisfaction_score
463,41882820000.0,12.261884,-0.224892,-1.384405,2,10.463156,-0.348097,3.450083,12.773738,0,15.749241,14.106199
50001,33698700000.0,-0.047132,15.768083,-0.108018,1,15.484304,6.062221,0.712064,3.621845,2,10.201937,13.843121
135480,41882820000.0,12.261884,-0.225743,1.562755,2,13.451021,2.407136,-0.417691,9.547934,2,11.362708,13.406864
70122,41882820000.0,12.261884,-0.396727,-1.104037,2,10.573633,2.88891,-0.425498,10.770726,2,13.082141,12.827887
125001,33625780000.0,-0.084545,21.653278,1.616657,0,23.098968,-0.348097,-0.417691,-0.14053,1,0.0,12.549484
9704,33658780000.0,-0.084545,-0.962418,0.473109,0,0.0,4.163083,3.77977,14.895485,0,22.750598,12.375299
43348,33650180000.0,-0.084545,-0.224707,-1.433339,1,0.0,6.073443,3.627775,13.107969,0,22.721334,12.360667
91011,41882820000.0,12.261884,-0.895782,1.734942,2,12.947485,6.032594,3.510223,-0.14053,2,9.18711,12.067298
6395,33664300000.0,-0.084545,-0.75803,-1.072254,1,0.0,2.88891,3.748714,15.02187,0,21.566958,11.783479
59443,33659380000.0,-0.072074,-0.217907,1.45458,0,0.0,5.642072,2.41713,13.567318,2,21.533842,11.766921


In [49]:
y = new_df['Satisfaction_score']

In [50]:
cols_drop = ['MSISDN/Number','Satisfaction_score']
X = new_df.drop(cols_drop,axis=1)

In [51]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150001 entries, 0 to 150000
Data columns (total 10 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Session frequency         150001 non-null  float64
 1   Dur. (ms)                 150001 non-null  float64
 2   Total_volume (Bytes)      150001 non-null  float64
 3   Engagement_labels         150001 non-null  int32  
 4   Engagement_score          150001 non-null  float64
 5   AVG RTT (ms)              150001 non-null  float64
 6   Avg Bearer TP (kbps)      150001 non-null  float64
 7   TCP Retrans. Vol (Bytes)  150001 non-null  float64
 8   Experience_labels         150001 non-null  int32  
 9   Experience_score          150001 non-null  float64
dtypes: float64(8), int32(2)
memory usage: 10.3 MB


In [52]:
y.value_counts()

1.000000    114921
1.598354       384
1.041111       261
2.653450       195
2.929474       140
             ...  
1.191943         1
7.847260         1
1.593370         1
1.684916         1
7.048839         1
Name: Satisfaction_score, Length: 30609, dtype: int64

In [53]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42, test_size= 0.25)

In [54]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(112500, 10)
(37501, 10)
(112500,)
(37501,)


In [55]:
lin_reg = LinearRegression()


In [56]:
lin_reg.fit(X_train,y_train)
print(lin_reg.intercept_)
print(lin_reg.coef_)

1.0000000000000009
[ 4.72236906e-13 -2.08721929e-14 -9.27209698e-16 -4.31599201e-15
  5.00000000e-01  7.42461648e-16  3.49113100e-16  1.42615954e-15
 -4.90926744e-16  5.00000000e-01]


In [57]:
model = lin_reg.predict(X_test)

In [58]:
print(f"Root mean squared error {np.sqrt(mean_squared_error(y_test,model))}")
print(f"Mean squared error {mean_squared_error(y_test,model)}")
print(f"Mean Absolute error {mean_absolute_error(y_test,model)}")

Root mean squared error 4.670735238805474e-13
Mean squared error 2.181576767101923e-25
Mean Absolute error 7.662227548117484e-14


In [59]:
score_check = pd.DataFrame()

In [60]:
score_check['y_test'] = y_test

In [61]:
score_check['model'] = model

In [62]:
score_check.head(50)

Unnamed: 0,y_test,model
16311,1.0,1.0
102956,1.0,1.0
21093,6.598262,6.598262
48936,1.0,1.0
9774,1.0,1.0
8563,1.512868,1.512868
41663,1.0,1.0
7416,1.0,1.0
21286,1.0,1.0
55010,1.0,1.0


In [63]:
metrics = new_df[['Engagement_score','Experience_score']]

In [64]:
met_cols = ['Engagement_score','Experience_score']

In [65]:
metrics[met_cols] = scaler.fit_transform(metrics[met_cols])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


In [66]:
Kmeans_2 = KMeans(
    init= "random",
    n_clusters = 2,
    n_init= 10,
    max_iter= 300,
    random_state= 42
)

In [67]:
Kmeans_2.fit(metrics)

KMeans(init='random', n_clusters=2, random_state=42)

In [68]:
metrics

Unnamed: 0,Engagement_score,Experience_score
0,16.997855,-0.276332
1,13.450663,-0.276332
2,12.024642,-0.276332
3,13.627015,-0.276332
4,10.200573,-0.276332
...,...,...
149996,-0.228068,-0.276332
149997,-0.228068,-0.276332
149998,-0.228068,-0.276332
149999,-0.228068,-0.276332


In [69]:
metrics['cluster'] = Kmeans_2.labels_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metrics['cluster'] = Kmeans_2.labels_


In [70]:
metrics['Satisfaction_score'] = new_df['Satisfaction_score']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metrics['Satisfaction_score'] = new_df['Satisfaction_score']


In [71]:
satisfied_per_cluster = metrics.groupby('cluster').agg({'Satisfaction_score':["median","mean"]})

In [72]:
satisfied_per_cluster

Unnamed: 0_level_0,Satisfaction_score,Satisfaction_score
Unnamed: 0_level_1,median,mean
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2
0,1.0,1.289093
1,6.389934,6.175635


In [73]:
experience_per_cluster = metrics.groupby('cluster').agg({'Experience_score':["median","mean"]})


In [74]:
experience_per_cluster

Unnamed: 0_level_0,Experience_score,Experience_score
Unnamed: 0_level_1,median,mean
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2
0,-0.276332,-2.4e-05
1,-0.276332,0.002222


In [75]:
new_df

Unnamed: 0,MSISDN/Number,Session frequency,Dur. (ms),Total_volume (Bytes),Engagement_labels,Engagement_score,AVG RTT (ms),Avg Bearer TP (kbps),TCP Retrans. Vol (Bytes),Experience_labels,Experience_score,Satisfaction_score
0,3.366496e+10,-0.084545,21.213047,-0.614351,1,20.416424,-0.348097,-0.417691,-0.14053,1,0.0,11.208212
1,3.368185e+10,-0.084545,15.554551,0.865130,0,16.212239,-0.348097,-0.417691,-0.14053,1,0.0,9.106119
2,3.376063e+10,-0.084545,15.513311,-0.769595,1,14.522097,-0.348097,-0.417691,-0.14053,1,0.0,8.261048
3,3.375034e+10,-0.084545,15.016588,1.610557,0,16.421254,-0.348097,-0.417691,-0.14053,1,0.0,9.210627
4,3.369980e+10,-0.084545,12.147531,0.457962,0,12.360183,-0.348097,-0.417691,-0.14053,1,0.0,7.180091
...,...,...,...,...,...,...,...,...,...,...,...,...
149996,3.365069e+10,-0.084545,-0.288492,0.556671,0,0.000000,-0.348097,-0.417691,-0.14053,1,0.0,1.000000
149997,3.366345e+10,-0.084545,-0.081920,0.859396,0,0.000000,-0.348097,-0.417691,-0.14053,1,0.0,1.000000
149998,3.362189e+10,-0.072074,-0.078477,0.539872,0,0.000000,-0.348097,-0.417691,-0.14053,1,0.0,1.000000
149999,3.361962e+10,-0.072074,-0.082660,-0.385639,1,0.000000,-0.348097,-0.417691,-0.14053,1,0.0,1.000000


In [76]:
metrics

Unnamed: 0,Engagement_score,Experience_score,cluster,Satisfaction_score
0,16.997855,-0.276332,1,11.208212
1,13.450663,-0.276332,1,9.106119
2,12.024642,-0.276332,1,8.261048
3,13.627015,-0.276332,1,9.210627
4,10.200573,-0.276332,1,7.180091
...,...,...,...,...
149996,-0.228068,-0.276332,0,1.000000
149997,-0.228068,-0.276332,0,1.000000
149998,-0.228068,-0.276332,0,1.000000
149999,-0.228068,-0.276332,0,1.000000


In [77]:
sql_df = pd.DataFrame()

In [78]:
sql_df['MSISDN/Number'] = new_df['MSISDN/Number']

In [79]:
cols_sql_df = ['Engagement_score','Experience_score','Satisfaction_score']
sql_df[cols_sql_df] = metrics[cols_sql_df]

In [80]:
sql_df

Unnamed: 0,MSISDN/Number,Engagement_score,Experience_score,Satisfaction_score
0,3.366496e+10,16.997855,-0.276332,11.208212
1,3.368185e+10,13.450663,-0.276332,9.106119
2,3.376063e+10,12.024642,-0.276332,8.261048
3,3.375034e+10,13.627015,-0.276332,9.210627
4,3.369980e+10,10.200573,-0.276332,7.180091
...,...,...,...,...
149996,3.365069e+10,-0.228068,-0.276332,1.000000
149997,3.366345e+10,-0.228068,-0.276332,1.000000
149998,3.362189e+10,-0.228068,-0.276332,1.000000
149999,3.361962e+10,-0.228068,-0.276332,1.000000


In [81]:
from sqlalchemy import create_engine
engine = create_engine('sqlite://',echo = False)

In [82]:
#credentials to database connections
hostname = 'localhost'
dbname = "mydb_name"
uname = "my_user_name"
pwd = "my_password"

In [83]:
#engine = create_engine("my_sql+pymysql://{user}:{pw}@{host}/{db}".format(host=hostname, db=dbname, user=uname, pw=pwd))

In [84]:
sql_df.to_sql('users', con=engine)
engine.execute("SELECT * FROM users").fetchall()

[(0, 33664962239.0, 16.99785521747741, -0.27633236955646856, 11.208212177174996),
 (1, 33681854413.0, 13.45066309914432, -0.27633236955646856, 9.106119338168021),
 (2, 33760627129.0, 12.024641901368975, -0.27633236955646856, 8.26104840653651),
 (3, 33750343200.0, 13.627015368006402, -0.27633236955646856, 9.210627024572055),
 (4, 33699795932.0, 10.200573025700134, -0.27633236955646856, 7.180091414784727),
 (5, 33668185951.0, 10.668065146983107, -0.27633236955646856, 7.4571307765272525),
 (6, 33665368271.0, 7.9146425317074325, -0.27633236955646856, 5.82543173986759),
 (7, 33763490140.0, 9.539599273263374, -0.27633236955646856, 6.788393377176939),
 (8, 33698743617.0, 9.682122665394783, -0.27633236955646856, 6.872853815742416),
 (9, 33659219748.0, 8.757422052172195, -0.27633236955646856, 6.324869248671942),
 (10, 33665646348.0, 7.809703869016504, -0.27633236955646856, 5.763244295822065),
 (11, 33664473872.0, 7.956117756745435, 3.3973373409884844, 8.600286935478872),
 (12, 33603291937.0, 8.

In [86]:
#conn = sql.connect('sql_df.db')
#sql_df.to_sql('sql_df', conn)

ValueError: Table 'sql_df' already exists.

In [91]:
conn = sql.connect('sql_df.db')
sql_db = pd.read_sql('SELECT * FROM sql_df WHERE Satisfaction_score == 1', conn)

In [92]:
sql_db

Unnamed: 0,index,MSISDN/Number,Engagement_score,Experience_score,Satisfaction_score
0,63,3.369943e+10,-0.228068,-0.276332,1.0
1,64,3.365951e+10,-0.228068,-0.276332,1.0
2,67,3.365079e+10,-0.228068,-0.276332,1.0
3,70,3.368243e+10,-0.228068,-0.276332,1.0
4,73,3.368451e+10,-0.228068,-0.276332,1.0
...,...,...,...,...,...
114916,149994,3.364566e+10,-0.228068,-0.276332,1.0
114917,149996,3.365069e+10,-0.228068,-0.276332,1.0
114918,149997,3.366345e+10,-0.228068,-0.276332,1.0
114919,149998,3.362189e+10,-0.228068,-0.276332,1.0
