In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from kneed import KneeLocator
from numpy.linalg import norm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score
import sqlite3 as sql

In [2]:
df = pd.read_csv('telco.csv')

In [3]:
df = df[df['MSISDN/Number'].notna()]

In [4]:
df = df[df['Bearer Id'].notna()]

In [5]:
df = df.reset_index()

In [6]:
num_cols = []
categorical_cols = []

In [7]:
def sort_cols(data):


    for i in data.columns:
        if data[i].dtypes == "object":
            categorical_cols.append(i)
        else :
            num_cols.append(i)

In [8]:
sort_cols(df)

In [9]:
def fix_cat_cols(data,cat_columns):
    for i in cat_columns:
        data[i] = data[i].fillna(data[i].mode()[0])

In [10]:
fix_cat_cols(df,categorical_cols)

In [11]:
def fix_num_cols(data,num_columns):
    for i in num_columns:
        data[i] = data[i].fillna(data[i].mean())

In [12]:
fix_num_cols(df,num_cols)

# Task 4

In [13]:
df['Total_volume (Bytes)'] = df['Total DL (Bytes)'] + df['Total UL (Bytes)']

In [14]:
df['Session frequency'] = df.groupby(['Bearer Id'])['Dur. (ms)'].transform('count')

In [15]:
df['AVG RTT (ms)'] = df['Avg RTT DL (ms)'] + df['Avg RTT UL (ms)']
df['Avg Bearer TP (kbps)'] = df['Avg Bearer TP DL (kbps)'] + df['Avg Bearer TP UL (kbps)']
df['TCP Retrans. Vol (Bytes)'] = df['TCP DL Retrans. Vol (Bytes)'] + df['TCP UL Retrans. Vol (Bytes)']

In [16]:
needed_cols = ['Avg RTT DL (ms)','Avg RTT UL (ms)','Avg Bearer TP DL (kbps)','Avg Bearer TP UL (kbps)','TCP DL Retrans. Vol (Bytes)','TCP UL Retrans. Vol (Bytes)',"Dur. (ms)",'Total_volume (Bytes)']

In [17]:
for _ in needed_cols:
    Q1 = df[_].quantile(0.05)
    Q3 = df[_].quantile(0.95)

    df[_] = np.where(df[_] < Q1, df[_].mean(), df[_] )
    df[_] = np.where(df[_] > Q3, df[_].mean(), df[_] )
    print(df[_].skew())

0.7443364651558915
0.8871263872840707
1.5683406246562437
2.829296077083227
-0.87730823412512
-1.047936833142215
0.6267722550599918
-0.01827735734167915


In [18]:
X = df[['Session frequency','Dur. (ms)','Total_volume (Bytes)']]

In [19]:
df.describe()

Unnamed: 0,index,Bearer Id,Start ms,End ms,Dur. (ms),IMSI,MSISDN/Number,IMEI,Avg RTT DL (ms),Avg RTT UL (ms),...,Gaming UL (Bytes),Other DL (Bytes),Other UL (Bytes),Total UL (Bytes),Total DL (Bytes),Total_volume (Bytes),Session frequency,AVG RTT (ms),Avg Bearer TP (kbps),TCP Retrans. Vol (Bytes)
count,148506.0,148506.0,148506.0,148506.0,148506.0,148506.0,148506.0,148506.0,148506.0,148506.0,...,148506.0,148506.0,148506.0,148506.0,148506.0,148506.0,148506.0,148506.0,148506.0,148506.0
mean,75154.149597,1.014676e+19,499.205143,498.640196,97737.305853,208201600000000.0,41906540000.0,48500680000000.0,70.464481,11.469813,...,8289600.0,421152500.0,8262685.0,41119370.0,454664000.0,496809500.0,1.240327,123.828174,15058.119901,21777860.0
std,43278.817266,2.893262e+18,288.563193,288.070129,45516.105336,15243470000.0,2450976000000.0,22432170000000.0,37.578415,9.157564,...,4781967.0,243163900.0,4768806.0,11278440.0,244179100.0,208436200.0,0.638478,535.706951,26855.062122,118369000.0
min,0.0,6.917538e+18,0.0,0.0,20184.0,204047100000000.0,33601000000.0,440015200000.0,23.0,1.0,...,59.0,3290.0,148.0,2866892.0,7114041.0,115793700.0,1.0,0.0,0.0,86.0
25%,37802.25,7.349883e+18,250.0,251.0,74224.5,208201400000000.0,33651310000.0,35460710000000.0,37.0,4.0,...,4130680.0,210304200.0,4144179.0,33218280.0,243126300.0,326754100.0,1.0,40.0,90.0,1571432.0
50%,75063.5,7.349883e+18,499.0,500.0,86400.0,208201500000000.0,33663710000.0,35722310000000.0,59.0,9.0,...,8291650.0,421861800.0,8266016.0,41141960.0,455940800.0,496922500.0,1.0,67.0,124.0,21777860.0
75%,112689.75,1.304243e+19,749.0,749.0,116681.25,208201800000000.0,33683520000.0,86119700000000.0,106.199942,17.628232,...,12432260.0,631633800.0,12380490.0,49032520.0,665754200.0,664250500.0,1.0,123.828174,22335.75,21777860.0
max,149999.0,1.318654e+19,999.0,999.0,241989.0,208252200000000.0,882397100000000.0,99001200000000.0,220.0,44.0,...,16558790.0,843442500.0,16558820.0,78331310.0,902969600.0,875314800.0,16.0,96924.0,382262.0,4344116000.0


In [20]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
col_names = ["Session frequency","Dur. (ms)",'Total_volume (Bytes)']
X[col_names] = scaler.fit_transform(X[col_names])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


In [21]:
kmeans = KMeans(
    init= "random",
    n_clusters = 3,
    n_init= 10,
    max_iter= 300,
    random_state= 42
)

In [22]:
kmeans.fit(X)

KMeans(init='random', n_clusters=3, random_state=42)

In [23]:
X['Engagement_labels'] = kmeans.labels_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Engagement_labels'] = kmeans.labels_


In [24]:
X

Unnamed: 0,Session frequency,Dur. (ms),Total_volume (Bytes),Engagement_labels
0,-0.376407,0.259964,-0.725309,0
1,-0.376407,0.259964,1.009309,0
2,-0.376407,0.259964,-0.907324,0
3,-0.376407,0.259964,0.091930,0
4,-0.376407,0.259964,0.531924,0
...,...,...,...,...
148501,-0.376407,-0.838174,1.803687,0
148502,-0.376407,-0.362671,0.647657,0
148503,-0.376407,0.005112,1.002586,0
148504,1.189822,0.011242,0.627960,2


In [25]:
X.describe()

Unnamed: 0,Session frequency,Dur. (ms),Total_volume (Bytes),Engagement_labels
count,148506.0,148506.0,148506.0,148506.0
mean,-2.053553e-16,-1.707148e-16,2.571247e-16,0.55292
std,1.000003,1.000003,1.000003,0.756484
min,-0.3764072,-1.703871,-1.827979,0.0
25%,-0.3764072,-0.5165839,-0.8158658,0.0
50%,-0.3764072,-0.2490842,0.0005423917,0.0
75%,-0.3764072,0.4162045,0.8033231,1.0
max,23.11703,3.169256,1.815935,2.0


In [26]:
X1 = X[X.Engagement_labels==1]
X0 = X[X.Engagement_labels==0]
X2 = X[X.Engagement_labels==2]

In [27]:
needed_cols = ['Avg RTT DL (ms)','Avg RTT UL (ms)','Avg Bearer TP DL (kbps)','Avg Bearer TP UL (kbps)','TCP DL Retrans. Vol (Bytes)','TCP UL Retrans. Vol (Bytes)']

In [28]:
df.describe()

Unnamed: 0,index,Bearer Id,Start ms,End ms,Dur. (ms),IMSI,MSISDN/Number,IMEI,Avg RTT DL (ms),Avg RTT UL (ms),...,Gaming UL (Bytes),Other DL (Bytes),Other UL (Bytes),Total UL (Bytes),Total DL (Bytes),Total_volume (Bytes),Session frequency,AVG RTT (ms),Avg Bearer TP (kbps),TCP Retrans. Vol (Bytes)
count,148506.0,148506.0,148506.0,148506.0,148506.0,148506.0,148506.0,148506.0,148506.0,148506.0,...,148506.0,148506.0,148506.0,148506.0,148506.0,148506.0,148506.0,148506.0,148506.0,148506.0
mean,75154.149597,1.014676e+19,499.205143,498.640196,97737.305853,208201600000000.0,41906540000.0,48500680000000.0,70.464481,11.469813,...,8289600.0,421152500.0,8262685.0,41119370.0,454664000.0,496809500.0,1.240327,123.828174,15058.119901,21777860.0
std,43278.817266,2.893262e+18,288.563193,288.070129,45516.105336,15243470000.0,2450976000000.0,22432170000000.0,37.578415,9.157564,...,4781967.0,243163900.0,4768806.0,11278440.0,244179100.0,208436200.0,0.638478,535.706951,26855.062122,118369000.0
min,0.0,6.917538e+18,0.0,0.0,20184.0,204047100000000.0,33601000000.0,440015200000.0,23.0,1.0,...,59.0,3290.0,148.0,2866892.0,7114041.0,115793700.0,1.0,0.0,0.0,86.0
25%,37802.25,7.349883e+18,250.0,251.0,74224.5,208201400000000.0,33651310000.0,35460710000000.0,37.0,4.0,...,4130680.0,210304200.0,4144179.0,33218280.0,243126300.0,326754100.0,1.0,40.0,90.0,1571432.0
50%,75063.5,7.349883e+18,499.0,500.0,86400.0,208201500000000.0,33663710000.0,35722310000000.0,59.0,9.0,...,8291650.0,421861800.0,8266016.0,41141960.0,455940800.0,496922500.0,1.0,67.0,124.0,21777860.0
75%,112689.75,1.304243e+19,749.0,749.0,116681.25,208201800000000.0,33683520000.0,86119700000000.0,106.199942,17.628232,...,12432260.0,631633800.0,12380490.0,49032520.0,665754200.0,664250500.0,1.0,123.828174,22335.75,21777860.0
max,149999.0,1.318654e+19,999.0,999.0,241989.0,208252200000000.0,882397100000000.0,99001200000000.0,220.0,44.0,...,16558790.0,843442500.0,16558820.0,78331310.0,902969600.0,875314800.0,16.0,96924.0,382262.0,4344116000.0


In [29]:
scaler = StandardScaler()
cols_scaled = ['AVG RTT (ms)','Avg Bearer TP (kbps)','TCP Retrans. Vol (Bytes)']

In [30]:
df[cols_scaled] = scaler.fit_transform(df[cols_scaled])


In [31]:
Y = df[['AVG RTT (ms)','Avg Bearer TP (kbps)','TCP Retrans. Vol (Bytes)']]

In [32]:
Y

Unnamed: 0,AVG RTT (ms),Avg Bearer TP (kbps),TCP Retrans. Vol (Bytes)
0,-1.434150e-01,-0.558225,3.147196e-17
1,-1.004810e-01,-0.559156,3.147196e-17
2,2.652738e-17,-0.560161,3.147196e-17
3,2.652738e-17,-0.557443,3.147196e-17
4,2.652738e-17,-0.560161,3.147196e-17
...,...,...,...
148501,3.699271e-01,1.852725,-4.276810e-02
148502,-1.714155e-01,-0.556363,3.147196e-17
148503,-1.770156e-01,-0.557853,3.147196e-17
148504,-1.396816e-01,-0.557369,3.147196e-17


In [33]:
kmeans.fit(Y)

KMeans(init='random', n_clusters=3, random_state=42)

In [34]:
Y['Experience_labels'] = kmeans.labels_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y['Experience_labels'] = kmeans.labels_


In [35]:
Y['Experience_labels'].value_counts()

0    120768
2     27484
1       254
Name: Experience_labels, dtype: int64

In [36]:
Y1 = Y[Y.Experience_labels==1]
Y0 = Y[Y.Experience_labels==0]
Y2 = Y[Y.Experience_labels==2]

## Task 4.1

In [37]:
X['Engagement_score'] =np.sqrt(((X2['Session frequency'] + X['Dur. (ms)'] + X['Total_volume (Bytes)']).pow(2)) - ((2**2)))
X['Engagement_score'] = X['Engagement_score'].fillna(0)                            

  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Engagement_score'] =np.sqrt(((X2['Session frequency'] + X['Dur. (ms)'] + X['Total_volume (Bytes)']).pow(2)) - ((2**2)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Engagement_score'] = X['Engagement_score'].fillna(0)


In [38]:
X['Engagement_score'].value_counts()

0.000000    140357
1.503369        67
1.659565        58
1.659599        21
1.503405        19
             ...  
0.570746         1
4.916402         1
5.096194         1
0.636853         1
1.690472         1
Name: Engagement_score, Length: 7882, dtype: int64

In [39]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148506 entries, 0 to 148505
Data columns (total 5 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Session frequency     148506 non-null  float64
 1   Dur. (ms)             148506 non-null  float64
 2   Total_volume (Bytes)  148506 non-null  float64
 3   Engagement_labels     148506 non-null  int32  
 4   Engagement_score      148506 non-null  float64
dtypes: float64(4), int32(1)
memory usage: 5.1 MB


In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148506 entries, 0 to 148505
Data columns (total 61 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   index                                     148506 non-null  int64  
 1   Bearer Id                                 148506 non-null  float64
 2   Start                                     148506 non-null  object 
 3   Start ms                                  148506 non-null  float64
 4   End                                       148506 non-null  object 
 5   End ms                                    148506 non-null  float64
 6   Dur. (ms)                                 148506 non-null  float64
 7   IMSI                                      148506 non-null  float64
 8   MSISDN/Number                             148506 non-null  float64
 9   IMEI                                      148506 non-null  float64
 10  Last Location Name  

In [41]:
Y['Experience_labels'].value_counts()

0    120768
2     27484
1       254
Name: Experience_labels, dtype: int64

In [42]:
Y['Experience_score'] =np.sqrt(((Y2['AVG RTT (ms)'] + Y2['Avg Bearer TP (kbps)'] + Y2['TCP Retrans. Vol (Bytes)']).pow(2)) - ((2**2)))
Y['Experience_score'] = Y['Experience_score'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y['Experience_score'] =np.sqrt(((Y2['AVG RTT (ms)'] + Y2['Avg Bearer TP (kbps)'] + Y2['TCP Retrans. Vol (Bytes)']).pow(2)) - ((2**2)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y['Experience_score'] = Y['Experience_score'].fillna(0)


In [43]:
Y

Unnamed: 0,AVG RTT (ms),Avg Bearer TP (kbps),TCP Retrans. Vol (Bytes),Experience_labels,Experience_score
0,-1.434150e-01,-0.558225,3.147196e-17,0,0.000000
1,-1.004810e-01,-0.559156,3.147196e-17,0,0.000000
2,2.652738e-17,-0.560161,3.147196e-17,0,0.000000
3,2.652738e-17,-0.557443,3.147196e-17,0,0.000000
4,2.652738e-17,-0.560161,3.147196e-17,0,0.000000
...,...,...,...,...,...
148501,3.699271e-01,1.852725,-4.276810e-02,2,0.867118
148502,-1.714155e-01,-0.556363,3.147196e-17,0,0.000000
148503,-1.770156e-01,-0.557853,3.147196e-17,0,0.000000
148504,-1.396816e-01,-0.557369,3.147196e-17,0,0.000000


In [44]:
Y['Experience_score'].value_counts()

0.000000    141059
2.997878         1
3.989228         1
1.963770         1
0.829829         1
             ...  
1.847142         1
1.797679         1
3.326823         1
1.412151         1
0.867118         1
Name: Experience_score, Length: 7448, dtype: int64

In [45]:
new_df= pd.DataFrame()

In [46]:
new_df['MSISDN/Number'] = df['MSISDN/Number']

In [47]:
new_df = new_df.join(X)
new_df = new_df.join(Y)

In [48]:
new_df['Experience_score'].value_counts()

0.000000    141059
2.997878         1
3.989228         1
1.963770         1
0.829829         1
             ...  
1.847142         1
1.797679         1
3.326823         1
1.412151         1
0.867118         1
Name: Experience_score, Length: 7448, dtype: int64

## Task 4.2

In [49]:
new_df['Satisfaction_score'] = ((new_df['Engagement_score'] + new_df['Experience_score'])/2)+1
new_df['Satisfaction_score'] = new_df['Satisfaction_score'].fillna(0)

In [50]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148506 entries, 0 to 148505
Data columns (total 12 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   MSISDN/Number             148506 non-null  float64
 1   Session frequency         148506 non-null  float64
 2   Dur. (ms)                 148506 non-null  float64
 3   Total_volume (Bytes)      148506 non-null  float64
 4   Engagement_labels         148506 non-null  int32  
 5   Engagement_score          148506 non-null  float64
 6   AVG RTT (ms)              148506 non-null  float64
 7   Avg Bearer TP (kbps)      148506 non-null  float64
 8   TCP Retrans. Vol (Bytes)  148506 non-null  float64
 9   Experience_labels         148506 non-null  int32  
 10  Experience_score          148506 non-null  float64
 11  Satisfaction_score        148506 non-null  float64
dtypes: float64(10), int32(2)
memory usage: 12.5 MB


In [51]:
new_df

Unnamed: 0,MSISDN/Number,Session frequency,Dur. (ms),Total_volume (Bytes),Engagement_labels,Engagement_score,AVG RTT (ms),Avg Bearer TP (kbps),TCP Retrans. Vol (Bytes),Experience_labels,Experience_score,Satisfaction_score
0,3.366496e+10,-0.376407,0.259964,-0.725309,0,0.0,-1.434150e-01,-0.558225,3.147196e-17,0,0.000000,1.000000
1,3.368185e+10,-0.376407,0.259964,1.009309,0,0.0,-1.004810e-01,-0.559156,3.147196e-17,0,0.000000,1.000000
2,3.376063e+10,-0.376407,0.259964,-0.907324,0,0.0,2.652738e-17,-0.560161,3.147196e-17,0,0.000000,1.000000
3,3.375034e+10,-0.376407,0.259964,0.091930,0,0.0,2.652738e-17,-0.557443,3.147196e-17,0,0.000000,1.000000
4,3.369980e+10,-0.376407,0.259964,0.531924,0,0.0,2.652738e-17,-0.560161,3.147196e-17,0,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
148501,3.366865e+10,-0.376407,-0.838174,1.803687,0,0.0,3.699271e-01,1.852725,-4.276810e-02,2,0.867118,1.433559
148502,3.365069e+10,-0.376407,-0.362671,0.647657,0,0.0,-1.714155e-01,-0.556363,3.147196e-17,0,0.000000,1.000000
148503,3.366345e+10,-0.376407,0.005112,1.002586,0,0.0,-1.770156e-01,-0.557853,3.147196e-17,0,0.000000,1.000000
148504,3.362189e+10,1.189822,0.011242,0.627960,2,0.0,-1.396816e-01,-0.557369,3.147196e-17,0,0.000000,1.000000


In [52]:
new_df['Satisfaction_score'].sort_values(ascending=False)[:10]

96280     17.551566
146882    14.364937
146874    13.918697
146883    13.178657
146879    12.773485
146876    12.483514
146871    12.483439
146881    12.436291
146877    12.387679
146880    12.348342
Name: Satisfaction_score, dtype: float64

In [53]:
new_df.sort_values(ascending=False, by='Satisfaction_score')[:10]

Unnamed: 0,MSISDN/Number,Session frequency,Dur. (ms),Total_volume (Bytes),Engagement_labels,Engagement_score,AVG RTT (ms),Avg Bearer TP (kbps),TCP Retrans. Vol (Bytes),Experience_labels,Experience_score,Satisfaction_score
96280,33651290000.0,-0.376407,-1.42331,-1.82184,0,0.0,30.701911,2.516699,-0.055115,2,33.103133,17.551566
146882,33661500000.0,23.117033,-0.63273,1.708955,2,24.110448,0.200057,3.225432,-0.129826,2,2.619427,14.364937
146874,33761680000.0,23.117033,-0.249106,-1.293938,2,21.481085,-0.023946,4.697897,0.119527,2,4.356309,13.918697
146883,33699460000.0,23.117033,-0.249106,0.973035,2,23.756924,0.045122,2.147382,-0.10433,2,0.600391,13.178657
146879,33667540000.0,23.117033,-0.249084,0.763805,2,23.54697,0.065655,0.256633,-0.150647,0,0.0,12.773485
146876,33762180000.0,23.117033,-0.249106,0.186019,2,22.967029,2.268359,-0.029384,-0.161037,0,0.0,12.483514
146871,33664850000.0,23.117033,-0.249084,0.185847,2,22.966878,0.041388,0.434738,0.691873,0,0.0,12.483439
146881,33659010000.0,23.117033,-0.249106,0.09193,2,22.872582,-0.048213,1.213402,-0.170335,2,0.0,12.436291
146877,33665160000.0,23.117033,-0.249106,-0.004923,2,22.775358,-0.078081,-0.254482,-0.177382,0,0.0,12.387679
146880,33669160000.0,23.117033,-0.249106,-0.083294,2,22.696684,-0.057547,0.940641,-0.151666,2,0.0,12.348342


## Task 4.3

In [54]:
y = new_df['Satisfaction_score']

In [55]:
cols_drop = ['MSISDN/Number','Satisfaction_score','Engagement_labels','Experience_labels','Experience_score','Engagement_score']
X = new_df.drop(cols_drop,axis=1)

In [56]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148506 entries, 0 to 148505
Data columns (total 6 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Session frequency         148506 non-null  float64
 1   Dur. (ms)                 148506 non-null  float64
 2   Total_volume (Bytes)      148506 non-null  float64
 3   AVG RTT (ms)              148506 non-null  float64
 4   Avg Bearer TP (kbps)      148506 non-null  float64
 5   TCP Retrans. Vol (Bytes)  148506 non-null  float64
dtypes: float64(6)
memory usage: 6.8 MB


In [57]:
y.value_counts()

1.000000    133482
1.751684        60
1.829782        44
1.829800        18
2.124327        17
             ...  
2.988382         1
1.894743         1
1.791199         1
1.456177         1
1.433559         1
Name: Satisfaction_score, Length: 14800, dtype: int64

In [58]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42, test_size= 0.25)

In [59]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(111379, 6)
(37127, 6)
(111379,)
(37127,)


In [60]:
lin_reg = LinearRegression()


In [61]:
lin_reg.fit(X_train,y_train)
print(lin_reg.intercept_)
print(lin_reg.coef_)

1.131965887060357
[0.30119396 0.05241914 0.04471233 0.01225596 0.21358481 0.02764595]


In [62]:
model = lin_reg.predict(X_test)

In [63]:
print(f"Root mean squared error {np.sqrt(mean_squared_error(y_test,model))}")
print(f"Mean squared error {mean_squared_error(y_test,model)}")
print(f"Mean Absolute error {mean_absolute_error(y_test,model)}")

Root mean squared error 0.3469708753261476
Mean squared error 0.12038878832459307
Mean Absolute error 0.20306360385678712


In [64]:
score_check = pd.DataFrame()

In [65]:
score_check['y_test'] = y_test

In [66]:
score_check['model'] = model

In [67]:
score_check

Unnamed: 0,y_test,model
63312,1.000000,0.938103
32540,1.000000,0.874447
32971,1.000000,0.980180
6396,1.000000,0.964886
43350,1.000000,0.890321
...,...,...
136000,1.000000,0.876281
139153,1.000000,0.974480
29881,1.000000,0.967792
59861,2.164552,1.656367


## Task 4.4

In [68]:
metrics = new_df[['Engagement_score','Experience_score']]

In [69]:
met_cols = ['Engagement_score','Experience_score']

In [70]:
metrics[met_cols] = scaler.fit_transform(metrics[met_cols])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


In [71]:
Kmeans_2 = KMeans(
    init= "random",
    n_clusters = 2,
    n_init= 10,
    max_iter= 300,
    random_state= 42
)

In [72]:
Kmeans_2.fit(metrics)

KMeans(init='random', n_clusters=2, random_state=42)

In [73]:
metrics

Unnamed: 0,Engagement_score,Experience_score
0,-0.184344,-0.179326
1,-0.184344,-0.179326
2,-0.184344,-0.179326
3,-0.184344,-0.179326
4,-0.184344,-0.179326
...,...,...
148501,-0.184344,1.136605
148502,-0.184344,-0.179326
148503,-0.184344,-0.179326
148504,-0.184344,-0.179326


In [74]:
metrics['cluster'] = Kmeans_2.labels_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metrics['cluster'] = Kmeans_2.labels_


## Task 4.5

In [75]:
metrics['Satisfaction_score'] = new_df['Satisfaction_score']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metrics['Satisfaction_score'] = new_df['Satisfaction_score']


In [76]:
satisfied_per_cluster = metrics.groupby('cluster').agg({'Satisfaction_score':["median","mean"]})

In [77]:
satisfied_per_cluster

Unnamed: 0_level_0,Satisfaction_score,Satisfaction_score
Unnamed: 0_level_1,median,mean
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2
0,1.0,1.083597
1,2.445868,2.781809


In [78]:
experience_per_cluster = metrics.groupby('cluster').agg({'Experience_score':["median","mean"]})


In [79]:
experience_per_cluster

Unnamed: 0_level_0,Experience_score,Experience_score
Unnamed: 0_level_1,median,mean
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2
0,-0.179326,-0.144941
1,4.030543,4.917293


In [80]:
new_df

Unnamed: 0,MSISDN/Number,Session frequency,Dur. (ms),Total_volume (Bytes),Engagement_labels,Engagement_score,AVG RTT (ms),Avg Bearer TP (kbps),TCP Retrans. Vol (Bytes),Experience_labels,Experience_score,Satisfaction_score
0,3.366496e+10,-0.376407,0.259964,-0.725309,0,0.0,-1.434150e-01,-0.558225,3.147196e-17,0,0.000000,1.000000
1,3.368185e+10,-0.376407,0.259964,1.009309,0,0.0,-1.004810e-01,-0.559156,3.147196e-17,0,0.000000,1.000000
2,3.376063e+10,-0.376407,0.259964,-0.907324,0,0.0,2.652738e-17,-0.560161,3.147196e-17,0,0.000000,1.000000
3,3.375034e+10,-0.376407,0.259964,0.091930,0,0.0,2.652738e-17,-0.557443,3.147196e-17,0,0.000000,1.000000
4,3.369980e+10,-0.376407,0.259964,0.531924,0,0.0,2.652738e-17,-0.560161,3.147196e-17,0,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
148501,3.366865e+10,-0.376407,-0.838174,1.803687,0,0.0,3.699271e-01,1.852725,-4.276810e-02,2,0.867118,1.433559
148502,3.365069e+10,-0.376407,-0.362671,0.647657,0,0.0,-1.714155e-01,-0.556363,3.147196e-17,0,0.000000,1.000000
148503,3.366345e+10,-0.376407,0.005112,1.002586,0,0.0,-1.770156e-01,-0.557853,3.147196e-17,0,0.000000,1.000000
148504,3.362189e+10,1.189822,0.011242,0.627960,2,0.0,-1.396816e-01,-0.557369,3.147196e-17,0,0.000000,1.000000


In [81]:
metrics

Unnamed: 0,Engagement_score,Experience_score,cluster,Satisfaction_score
0,-0.184344,-0.179326,0,1.000000
1,-0.184344,-0.179326,0,1.000000
2,-0.184344,-0.179326,0,1.000000
3,-0.184344,-0.179326,0,1.000000
4,-0.184344,-0.179326,0,1.000000
...,...,...,...,...
148501,-0.184344,1.136605,0,1.433559
148502,-0.184344,-0.179326,0,1.000000
148503,-0.184344,-0.179326,0,1.000000
148504,-0.184344,-0.179326,0,1.000000


## Task 4.6

In [82]:
sql_df = pd.DataFrame()

In [83]:
sql_df['MSISDN/Number'] = new_df['MSISDN/Number']

In [84]:
cols_sql_df = ['Engagement_score','Experience_score','Satisfaction_score']
sql_df[cols_sql_df] = metrics[cols_sql_df]

In [85]:
sql_df

Unnamed: 0,MSISDN/Number,Engagement_score,Experience_score,Satisfaction_score
0,3.366496e+10,-0.184344,-0.179326,1.000000
1,3.368185e+10,-0.184344,-0.179326,1.000000
2,3.376063e+10,-0.184344,-0.179326,1.000000
3,3.375034e+10,-0.184344,-0.179326,1.000000
4,3.369980e+10,-0.184344,-0.179326,1.000000
...,...,...,...,...
148501,3.366865e+10,-0.184344,1.136605,1.433559
148502,3.365069e+10,-0.184344,-0.179326,1.000000
148503,3.366345e+10,-0.184344,-0.179326,1.000000
148504,3.362189e+10,-0.184344,-0.179326,1.000000


In [86]:
from sqlalchemy import create_engine
engine = create_engine('sqlite://',echo = False)

In [87]:
#conn = sql.connect('sql_df.db')
#sql_df.to_sql('sql_df', conn)

In [88]:
conn = sql.connect('sql_df.db')
sql_db = pd.read_sql('SELECT * FROM sql_df WHERE Satisfaction_score == 1', conn)

In [89]:
sql_db

Unnamed: 0,index,MSISDN/Number,Engagement_score,Experience_score,Satisfaction_score
0,63,3.369943e+10,-0.228068,-0.276332,1.0
1,64,3.365951e+10,-0.228068,-0.276332,1.0
2,67,3.365079e+10,-0.228068,-0.276332,1.0
3,70,3.368243e+10,-0.228068,-0.276332,1.0
4,73,3.368451e+10,-0.228068,-0.276332,1.0
...,...,...,...,...,...
114916,149994,3.364566e+10,-0.228068,-0.276332,1.0
114917,149996,3.365069e+10,-0.228068,-0.276332,1.0
114918,149997,3.366345e+10,-0.228068,-0.276332,1.0
114919,149998,3.362189e+10,-0.228068,-0.276332,1.0


## Task 4.7

In [90]:
import pickle
with open('model_pkl_1', 'wb') as files:
    pickle.dump(lin_reg, files)

In [91]:
import joblib

In [92]:
joblib.dump(lin_reg, 'model_joblib')

['model_joblib']