### NOTE :: Some cols meaning
- 4G_rat => Tower supports 3G/4G indicator
- par_min => Minute bucket under consideration
- Beam_direction: Tower beam direction
- Cell_range: Cell tower range (How Far Can a Cell Tower Be for a Cellphone to Pick Up the Signal)
- Tilt: Cell tower tilt 
- Ran_vendor: Service Vendor
<hr>
- [RAN](https://en.wikipedia.org/wiki/Radio_access_network) - Radio Access Network
- [Backhaul](https://en.wikipedia.org/wiki/Backhaul_%28telecommunications%29) - Links connecting the radio access network to the core network

- Tilt adjustment means correctly setting the inclination of the antenna in relation to an axis. With the tilt, we direct the radiation further down (or higher), concentrating the energy in the new desired direction.
- The tilt represents the inclination or angle of the antenna to its axis.
- When the antenna is tilted down, we call it 'downtilt', which is the most common use. If the inclination is up (very rare and extreme cases), we call 'uptilt'.
- Downtilt is used when we want to reduce interference and/or coverage in some specific areas, having each cell to meet only its designed area.

### NOTE::
- Different cell towers (identified by different cell names) have different load capacities, which depend on their hardware configuration (here: tilt, cell-tower range, beam direction).
- So, given the usage statistics and metadata such as the date, we need to predict the type of congestion.

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
#standard plotly imports
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

#cufflinks + plotly
import cufflinks as cf
cf.go_offline(connected = True)
# set global
cf.set_config_file(world_readable = True, theme = 'pearl', offline = True)

init_notebook_mode(connected = True)

In [None]:
from scipy.stats import norm

In [None]:
# 'axes.grid' is the matplotlib rc key; the previous 'axes,grid' (comma) was not a valid key,
# so the intended "no grid lines" override was never applied.
sns.set_style('whitegrid', {'axes.grid' : False})

In [None]:
pd.set_option('display.max_columns', 45)

In [None]:
train_df = pd.read_csv('../input/train_upd.csv')
test_df = pd.read_csv('../input/test_upd.csv')

In [None]:
y_test = pd.read_csv('../input/y_test.csv')

In [None]:
160261 in test_df['cell_name'].values.tolist()

In [None]:
# Attach the test labels to the test features column-wise; the duplicate 'cell_name'
# column of y_test is dropped first.
# NOTE(review): an axis=1 concat aligns on the index, so this assumes test_df and y_test
# list the towers in the same row order — confirm, or join on 'cell_name' instead.
tmp1 = pd.concat((test_df, y_test.drop(columns = 'cell_name')), axis = 1)

In [None]:
train_df.head()

In [None]:
target_name = 'Congestion_Type'

In [None]:
# Sanity check: every training row must belong to a distinct cell tower.
assert train_df['cell_name'].is_unique

In [None]:
train_df.drop(columns = ['cell_name', 'par_year', 'par_month']).describe()

In [None]:
print('Length of training data %d and testing data %d'%(train_df.shape[0], test_df.shape[0]))

In [None]:
print('Train - {},  Test - {}'.format(*(train_df.isnull().values.any(), test_df.isnull().values.any())))

In [None]:
fig = plt.figure(figsize = (6, 6))
total = train_df.shape[0]
ax = sns.countplot(x = target_name, data = train_df, order = train_df[target_name].value_counts().sort_values(ascending = False).index)
ax.set_xticklabels(train_df[target_name].value_counts().sort_values(ascending = False).index, rotation = 90)
for p in ax.patches:
    ax.annotate('{:.2f}%'.format(p.get_height() * 100 / total), (p.get_x() + 0.1, p.get_height() + 15))

In [None]:
fig = plt.figure(figsize = (6, 6))
total = tmp1.shape[0]
ax = sns.countplot(x = target_name, data = tmp1, order = tmp1[target_name].value_counts().sort_values(ascending = False).index)
ax.set_xticklabels(tmp1[target_name].value_counts().sort_values(ascending = False).index, rotation = 90)
for p in ax.patches:
    ax.annotate('{:.2f}%'.format(p.get_height() * 100 / total), (p.get_x() + 0.1, p.get_height() + 15))

In [None]:
plt.figure(figsize = (5, 5))
cnt = train_df[target_name].value_counts()
sizes = cnt.values
plt.pie(sizes, labels = cnt.index, autopct = '%1.1f%%', shadow = True);

In [None]:
col = '4G_rat'
fig = plt.figure(figsize = (6, 6))
total = train_df.shape[0]
ax = sns.countplot(x = col, data = train_df, order = train_df[col].value_counts().sort_values(ascending = False).index)
# ax.set_xticklabels(train_df[col].value_counts().sort_values(ascending = False).index, rotation = 90)
for p in ax.patches:
    ax.annotate('{:.2f}%'.format(p.get_height() * 100 / total), (p.get_x() + 0.3, p.get_height() + 15))

In [None]:
col = '4G_rat'
fig = plt.figure(figsize = (6, 6))
total = test_df.shape[0]
ax = sns.countplot(x = col, data = test_df, order = test_df[col].value_counts().sort_values(ascending = False).index)
# ax.set_xticklabels(train_df[col].value_counts().sort_values(ascending = False).index, rotation = 90)
for p in ax.patches:
    ax.annotate('{:.2f}%'.format(p.get_height() * 100 / total), (p.get_x() + 0.3, p.get_height() + 15))

In [None]:
col = 'ran_vendor'
fig = plt.figure(figsize = (6, 6))
total = train_df.shape[0]
ax = sns.countplot(x = col, data = train_df, order = train_df[col].value_counts().sort_values(ascending = False).index)
# ax.set_xticklabels(train_df[col].value_counts().sort_values(ascending = False).index, rotation = 90)
for p in ax.patches:
    ax.annotate('{:.2f}%'.format(p.get_height() * 100 / total), (p.get_x() + 0.3, p.get_height() + 15))

In [None]:
col = 'ran_vendor'
fig = plt.figure(figsize = (6, 6))
total = test_df.shape[0]
ax = sns.countplot(x = col, data = test_df, order = test_df[col].value_counts().sort_values(ascending = False).index)
# ax.set_xticklabels(train_df[col].value_counts().sort_values(ascending = False).index, rotation = 90)
for p in ax.patches:
    ax.annotate('{:.2f}%'.format(p.get_height() * 100 / total), (p.get_x() + 0.3, p.get_height() + 15))

In [None]:
### Relation between 4G_rat and Target var
ax = sns.countplot(x = target_name, hue = '4G_rat', data = train_df, order = train_df[target_name].value_counts().sort_values(ascending = False).index)
ax.set_xticklabels(train_df[target_name].value_counts().sort_values(ascending = False).index, rotation = 90);

In [None]:
ax = sns.countplot(x = target_name, hue = 'ran_vendor', data = train_df, order = train_df[target_name].value_counts().sort_values(ascending = False).index)
ax.set_xticklabels(train_df[target_name].value_counts().sort_values(ascending = False).index, rotation = 90);

In [None]:
ax = sns.countplot(x = '4G_rat', hue = 'ran_vendor', data = train_df)

In [None]:
ax = sns.countplot(x = '4G_rat', hue = 'ran_vendor', data = test_df)

### Tilt vs Sub Cnt

In [None]:
sns.boxplot(x = 'tilt', y = 'subscriber_count', hue = 'ran_vendor', data = train_df);

In [None]:
plt.figure(figsize = (10, 5))
sns.boxplot(x = 'tilt', y = 'subscriber_count', hue = 'ran_vendor', data = train_df.sort_values('subscriber_count').iloc[:10000]);

In [None]:
plt.figure(figsize = (20, 5))
sns.boxplot(x = 'tilt', y = 'subscriber_count', hue = 'cell_range', data = train_df.sort_values('subscriber_count').iloc[:10000]);

### Cell Range vs Sub Cnt

In [None]:
plt.figure(figsize = (15, 10))
sns.boxplot(x = 'cell_range', y = 'subscriber_count', hue = 'tilt', data = train_df)

In [None]:
plt.figure(figsize = (10, 5))
# Plot only the 10,000 rows with the smallest subscriber_count. After sort_values the index
# labels are shuffled, so the positional .iloc is required here; label-based .loc[:10000]
# would stop wherever label 10000 happens to land in the sorted order.
sns.boxplot(x = 'cell_range', y = 'subscriber_count', hue = 'tilt', data = train_df.sort_values('subscriber_count').iloc[:10000])

In [None]:
# Distribution of subscriber_count in the training set, with a fitted normal curve overlaid.
sns.distplot(train_df['subscriber_count'].values, fit = norm)

# Recover the parameters of the fitted normal so they can be reported in the legend.
(mu, sigma) = norm.fit(train_df['subscriber_count'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Raw string: '\m' and '\s' are invalid escape sequences in a normal string literal.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc = 'best')
plt.ylabel('Frequency')
plt.title('Sub Cnt');

In [None]:
# Distribution of subscriber_count in the test set, with a fitted normal curve overlaid.
sns.distplot(test_df['subscriber_count'].values, fit = norm)

# Recover the parameters of the fitted normal so they can be reported in the legend.
(mu, sigma) = norm.fit(test_df['subscriber_count'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Raw string: '\m' and '\s' are invalid escape sequences in a normal string literal.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc = 'best')
plt.ylabel('Frequency')
plt.title('Sub Cnt');

In [None]:
ax = sns.boxplot(x = target_name, y = 'subscriber_count', data = train_df, order = train_df[target_name].value_counts().sort_values(ascending = False).index)
ax.set_xticklabels(train_df[target_name].value_counts().sort_values(ascending = False).index, rotation = 90);

In [None]:
train_df[train_df.loc[:, target_name] == 'NC']['subscriber_count'].max()

In [None]:
train_df[train_df.loc[:, target_name] == '4G_BACKHAUL_CONGESTION']['subscriber_count'].max()

In [None]:
fig = plt.figure(figsize = (10, 13))
ax = sns.boxplot(x = target_name, y = 'subscriber_count', hue = '4G_rat', data = train_df, order = train_df[target_name].value_counts().sort_values(ascending = False).index)
ax.set_xticklabels(train_df[target_name].value_counts().sort_values(ascending = False).index, rotation = 90);

In [None]:
fig = plt.figure(figsize = (10, 10))
ax = sns.boxplot(x = target_name, y = 'subscriber_count', hue = 'ran_vendor', data = train_df, order = train_df[target_name].value_counts().sort_values(ascending = False).index)
ax.set_xticklabels(train_df[target_name].value_counts().sort_values(ascending = False).index, rotation = 90);

In [None]:
fig = plt.figure(figsize = (10, 5))
nc_df = train_df[train_df.loc[:, target_name] == 'NC']
ax = sns.boxplot(x = target_name, y = 'subscriber_count', hue = 'ran_vendor', data = nc_df)
# ax.set_xticklabels(train_df[target_name].value_counts().sort_values(ascending = False).index, rotation = 90);

In [None]:
fig = plt.figure(figsize = (10, 10))
ax = sns.boxplot(x = target_name, y = 'subscriber_count', hue = 'tilt', data = train_df, order = train_df[target_name].value_counts().sort_values(ascending = False).index)
ax.set_xticklabels(train_df[target_name].value_counts().sort_values(ascending = False).index, rotation = 90);

In [None]:
fig = plt.figure(figsize = (10, 10))
ax = sns.boxplot(x = target_name, y = 'subscriber_count', hue = 'cell_range', data = train_df, order = train_df[target_name].value_counts().sort_values(ascending = False).index)
ax.set_xticklabels(train_df[target_name].value_counts().sort_values(ascending = False).index, rotation = 90);

### Possible rule: if sub_cnt >= 4000, there is always congestion <br> a similar rule can be applied at the upper limit, i.e. 4G_RAN_CONGESTION if sub_cnt >= 11,000

## NOTE ::
- Training => 78650,  Testing => 26305
- No null objects
- Categorical cols => 4G_rat(2), ran_vendor(3), cell_range(6), tilt(4), Congestion_Type(4)[target] 
- Target class is balanced
- Analyse sub_cnt, all_browsing_bytes
- 4G_rat col alone is useless
- Above a sub_cnt of 2185, there is always congestion

- vendors support nearly equal no of 3G and 4G towers

<hr>
- Equal no of 3G/4G towers are supported by each ran_vendor
- As tilt increases, sub_cnt tends to increase. (Tilt vs Sub Cnt depends on no of other parameters so just a hunch)
- 4G_rat not related to sub_cnt
- ran_vendor not related to sub_cnt

## DOUBT ::
- How can 4G_RAN_CONGESTION, 4G_BACKHAUL_CONGESTION occur in tower with no 4G support???
- Different cell towers record their values at different time periods, so there is a discrepancy.

## TASK ::
- Divide diff bytes into region and see the plots

In [None]:
byte_cols = train_df.columns[8:34]

In [None]:
# Per-row total over all byte-counter columns. DataFrame.sum is vectorized, unlike
# apply(sum, axis=1), which calls Python's sum once per row.
total_bytes = train_df.loc[:, byte_cols].sum(axis = 1)

In [None]:
tmp = train_df.copy(deep = True)

In [None]:
tmp['total_bytes'] = total_bytes

In [None]:
# Distribution of the per-row total_bytes, with a fitted normal curve overlaid.
sns.distplot(tmp['total_bytes'].values, fit = norm)

# Recover the parameters of the fitted normal so they can be reported in the legend.
(mu, sigma) = norm.fit(tmp['total_bytes'].values)
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Raw string: '\m' and '\s' are invalid escape sequences in a normal string literal.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc = 'best')
plt.ylabel('Frequency')
# This cell plots total_bytes, not subscriber_count, so the title says so.
plt.title('Total Bytes');

In [None]:
print("Skewness: %f" % tmp['total_bytes'].skew())
print("Kurtosis: %f" % tmp['total_bytes'].kurt())

In [None]:
print("Skewness: %f" % tmp['subscriber_count'].skew())
print("Kurtosis: %f" % tmp['subscriber_count'].kurt())

In [None]:
y = 'total_bytes'

In [None]:
ax = sns.boxplot(x = target_name, y = y, data = tmp, order = tmp[target_name].value_counts().sort_values(ascending = False).index)
ax.set_xticklabels(tmp[target_name].value_counts().sort_values(ascending = False).index, rotation = 90);

In [None]:
tmp[tmp.loc[:, target_name] == 'NC'][y].max()

In [None]:
tmp[tmp.loc[:, target_name] == '4G_BACKHAUL_CONGESTION'][y].max()

In [None]:
fig = plt.figure(figsize = (10, 10))
ax = sns.boxplot(x = target_name, y = y, hue = '4G_rat', data = tmp, order = tmp[target_name].value_counts().sort_values(ascending = False).index)
ax.set_xticklabels(tmp[target_name].value_counts().sort_values(ascending = False).index, rotation = 90);

In [None]:
fig = plt.figure(figsize = (10, 10))
ax = sns.boxplot(x = target_name, y = y, hue = 'ran_vendor', data = tmp, order = tmp[target_name].value_counts().sort_values(ascending = False).index)
ax.set_xticklabels(tmp[target_name].value_counts().sort_values(ascending = False).index, rotation = 90);

In [None]:
fig = plt.figure(figsize = (10, 5))
nc_df = tmp[tmp.loc[:, target_name] == 'NC']
ax = sns.boxplot(x = target_name, y = y, hue = 'ran_vendor', data = nc_df)
# ax.set_xticklabels(train_df[target_name].value_counts().sort_values(ascending = False).index, rotation = 90);

In [None]:
fig = plt.figure(figsize = (10, 10))
ax = sns.boxplot(x = target_name, y = y, hue = 'tilt', data = tmp, order = tmp[target_name].value_counts().sort_values(ascending = False).index)
ax.set_xticklabels(tmp[target_name].value_counts().sort_values(ascending = False).index, rotation = 90);

In [None]:
fig = plt.figure(figsize = (10, 10))
ax = sns.boxplot(x = target_name, y = y, hue = 'cell_range', data = tmp, order = tmp[target_name].value_counts().sort_values(ascending = False).index)
ax.set_xticklabels(tmp[target_name].value_counts().sort_values(ascending = False).index, rotation = 90);

In [None]:
sns.lmplot(x = 'subscriber_count', y = 'total_bytes', data = tmp)

## Clustering on the basis of labels

### Label encoding tmp_df

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
def label_encode(df, col):
    """Replace ``df[col]`` in place with integer codes and return the fitted encoder.

    df: DataFrame to modify; its ``col`` column is overwritten.
    col: name of the column to encode.
    Returns the fitted ``LabelEncoder`` so the same mapping can be reused later.
    """
    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(df[col])
    df[col] = encoded_values
    return encoder

In [None]:
label_encode(tmp, 'ran_vendor');

In [None]:
lb = label_encode(tmp, 'Congestion_Type')

In [None]:
tmp.drop(columns = ['cell_name', 'par_year', 'par_month', target_name], inplace = True)

In [None]:
from sklearn.manifold import TSNE

In [None]:
tsne = TSNE(
    n_components = 2,
    init = 'random',      
    random_state = 101,
    method = 'barnes_hut',
    n_iter = 300,
    verbose = 2,
    angle = 0.5
).fit_transform(tmp[:10000].values)

In [None]:
# t-SNE was fitted on a leading slice of the rows only, so encode the labels for exactly
# those rows; previously z/color held one entry per row of the whole training set.
tsne_labels = lb.transform(train_df['Congestion_Type'].values[:len(tsne)])
trace = go.Scatter3d(
    x = tsne[:, 0],
    y = tsne[:, 1],
    z = tsne_labels,   # encoded congestion class used as the third axis
    mode = 'markers',
    marker = dict(
        size = 12,
        color = tsne_labels,
        colorscale = 'Portland',
        colorbar = dict(title = 'Congestion'),
        line = dict(color='rgb(255, 255, 255)'),
        opacity = 0.5
    )
)

layout = go.Layout(
    scene = dict(
        camera = dict(
            eye = dict(
            x = 0.5,
            y = 0.5,
            z = 0.5
            )
        )
    ),
    margin = dict(
        l = 0,
        r = 0,
        b = 0,
        t = 0
    )
)
fig = go.Figure(data = [trace], layout = layout)
iplot(fig, filename = '3d-scatter')

In [None]:
plot_df = pd.DataFrame(dict(x=tsne[:,0], y=tsne[:,1], color=lb.transform(train_df.loc[:9999, target_name])))
# x / y are passed by keyword: newer seaborn releases no longer accept them positionally.
sns.lmplot(x = 'x', y = 'y', data=plot_df, hue='color', fit_reg=False, scatter_kws = {'alpha' : 0.3})
plt.show()

In [None]:
tsne = TSNE(
    n_components = 3,
    init = 'random', # pca
    random_state = 101,
    method = 'barnes_hut',
    n_iter = 300,
    verbose = 2,
    angle = 0.5
).fit_transform(tmp[:10000].values)

In [None]:
# t-SNE was fitted on a leading slice of the rows only, so encode the labels for exactly
# those rows; previously color held one entry per row of the whole training set.
tsne_labels = lb.transform(train_df['Congestion_Type'].values[:len(tsne)])
trace = go.Scatter3d(
    x = tsne[:, 0],
    y = tsne[:, 1],
    z = tsne[:, 2],
    mode = 'markers',
    marker = dict(
        size = 12,
        color = tsne_labels,
        colorscale = 'Portland',
        colorbar = dict(title = 'Congestion'),
        line = dict(color='rgb(255, 255, 255)'),
        opacity = 0.5
    )
)

layout = go.Layout(
    scene = dict(
        camera = dict(
            eye = dict(
            x = 0.5,
            y = 0.5,
            z = 0.5
            )
        )
    ),
    margin = dict(
        l = 0,
        r = 0,
        b = 0,
        t = 0
    )
)
fig = go.Figure(data = [trace], layout = layout)
iplot(fig, filename = '3d-scatter')

In [None]:
plot_df = pd.DataFrame(dict(x=tsne[:,0], y=tsne[:,1], color=lb.transform(train_df.loc[:9999, target_name])))
# x / y are passed by keyword: newer seaborn releases no longer accept them positionally.
sns.lmplot(x = 'x', y = 'y', data=plot_df, hue='color', fit_reg=False, scatter_kws = {'alpha' : 0.3})
plt.show()

In [None]:
import umap

In [None]:
reduce = umap.UMAP(random_state = 223, n_components = 3)  #just for reproducibility
embeddings = reduce.fit_transform(tmp.values)

In [None]:
trace = go.Scatter3d(
    x = embeddings[:, 0],
    y = embeddings[:, 1],
    z = embeddings[:, 2],
    mode = 'markers',
    marker = dict(
        size = 12,
        color = lb.transform(train_df['Congestion_Type'].values),
        colorscale = 'Portland',
        colorbar = dict(title = 'Congestion'),
        line = dict(color='rgb(255, 255, 255)'),
        opacity = 0.5
    )
)

layout = go.Layout(
    scene = dict(
        camera = dict(
            eye = dict(
            x = 0.5,
            y = 0.5,
            z = 0.5
            )
        )
    ),
    margin = dict(
        l = 0,
        r = 0,
        b = 0,
        t = 0
    )
)
fig = go.Figure(data = [trace], layout = layout)
iplot(fig, filename = '3d-scatter-umap')

In [None]:
reduce = umap.UMAP(random_state = 446, n_components = 2)  #just for reproducibility
embeddings = reduce.fit_transform(tmp.values)

In [None]:
plot_df = pd.DataFrame(embeddings, columns = ['x', 'y'])
plot_df[target_name] = train_df.loc[:, target_name]

In [None]:
# fig = plt.Figure(figsize = (15, 15))
ax = sns.pairplot(x_vars = ['x'], y_vars = ['y'], data = plot_df, hue = target_name, kind = 'scatter', size = 11, plot_kws = {'s' : 80, 'alpha' : 0.6})
ax.fig.suptitle('Embedding clustered with UMAP');

In [None]:
trace = go.Scatter3d(
    x = embeddings[:, 0],
    y = embeddings[:, 1],
    z = lb.transform(train_df['Congestion_Type'].values),
    mode = 'markers',
    marker = dict(
        size = 12,
        color = lb.transform(train_df['Congestion_Type'].values),
        colorscale = 'Portland',
        colorbar = dict(title = 'Congestion'),
        line = dict(color='rgb(255, 255, 255)'),
        opacity = 0.5
    )
)

layout = go.Layout(
    scene = dict(
        camera = dict(
            eye = dict(
            x = 0.5,
            y = 0.5,
            z = 0.5
            )
        )
    ),
    margin = dict(
        l = 0,
        r = 0,
        b = 0,
        t = 0
    )
)
fig = go.Figure(data = [trace], layout = layout)
iplot(fig, filename = '3d-scatter-umap-with-label')

### Using HDBSCAN to get the no of clusters

In [None]:
!pip install hdbscan

In [None]:
reduce = umap.UMAP(random_state = 1333, n_components = 2)  #just for reproducibility
embeddings = reduce.fit_transform(tmp.values)

In [None]:
import hdbscan

In [None]:
import time

In [None]:
plot_kwds = {'alpha' : 0.4, 's' : 80, 'linewidths':0}

In [None]:
def plot_clusters(data, algorithm, args, kwds):
    """Fit ``algorithm`` on ``data``, scatter-plot the clusters and return the fitted model.

    Parameters
    ----------
    data : 2-D array
        Points to cluster; columns 0 and 1 are used as the plot coordinates.
    algorithm : class
        Clusterer class (e.g. ``hdbscan.HDBSCAN``) whose fitted instance exposes
        ``labels_`` and ``probabilities_``.
    args : tuple
        Positional arguments forwarded to ``algorithm``.
    kwds : dict
        Keyword arguments forwarded to ``algorithm``.

    Returns
    -------
    The fitted clusterer instance.
    """
    start_time = time.time()
    clusterer = algorithm(*args, **kwds).fit(data)
    end_time = time.time()

    # Size the palette from the very labels that index it. It used to be sized from
    # approximate_predict() output, which may contain fewer clusters than labels_ (and is
    # empty when everything is noise), making palette[col] raise IndexError.
    n_colors = max(int(np.max(clusterer.labels_)) + 1, 1)
    palette = sns.color_palette('deep', n_colors)

    # Each point is desaturated by its membership probability. A negative (noise) label
    # indexes the last palette entry; HDBSCAN gives noise probability 0, so it renders grey.
    colors = [sns.desaturate(palette[col], sat) for col, sat in zip(clusterer.labels_, clusterer.probabilities_)]
    plt.scatter(data[:, 0], data[:, 1], c=colors, **plot_kwds)

    # The embedding axes carry no meaning, so hide them.
    frame = plt.gca()
    frame.axes.get_xaxis().set_visible(False)
    frame.axes.get_yaxis().set_visible(False)
    plt.title('Clusters found by {}'.format(str(algorithm.__name__)), fontsize=24)
    plt.text(-0.5, 0.7, 'Clustering took {:.2f} s'.format(end_time - start_time), fontsize=14)
    return clusterer

In [None]:
clusterer = plot_clusters(embeddings, hdbscan.HDBSCAN, (), {'prediction_data':True, 'min_cluster_size':15})

In [None]:
np.unique(clusterer.labels_)

In [None]:
clusterer = plot_clusters(tsne, hdbscan.HDBSCAN, (), {'prediction_data':True, 'min_cluster_size':15})

In [None]:
np.unique(clusterer.labels_)

## Preprocess tmp copy and repeat the cluster formation steps

In [None]:
tmp_copy = tmp.copy(deep = True)

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

In [None]:
def normalize_col(normalize, df, col):
    """Rescale one column of ``df`` in place with the scaler selected by ``normalize``.

    normalize: 1 -> MinMaxScaler over (0, 1), 2 -> StandardScaler, 3 -> RobustScaler.
        Any other value leaves ``df`` untouched.
    df: DataFrame that is modified in place.
    col: name of the column to rescale.
    """
    # Factories are lambdas so that a scaler is only constructed for the selected code.
    scaler_factories = {
        1: lambda: MinMaxScaler(feature_range = (0, 1)),
        2: lambda: StandardScaler(),
        3: lambda: RobustScaler(),
    }
    make_scaler = scaler_factories.get(normalize)
    if make_scaler is None:
        return

    # Scalers expect a 2-D (n_samples, 1) array rather than a flat column.
    column_2d = np.expand_dims(df[col].values, axis = 1)
    df[col] = make_scaler().fit_transform(column_2d)

In [None]:
cols2normalize = list(tmp_copy.columns[4:32]) + [tmp_copy.columns[-1]]

In [None]:
for col in cols2normalize:
    normalize_col(3, tmp_copy, col)

In [None]:
tmp_copy.describe()

In [None]:
tsne = TSNE(
    n_components = 3,
    init = 'random', # pca
    random_state = 101,
    method = 'barnes_hut',
    n_iter = 300,
    verbose = 2,
    angle = 0.5
).fit_transform(tmp_copy[:10000].values)

In [None]:
# t-SNE was fitted on a leading slice of the rows only, so encode the labels for exactly
# those rows; previously color held one entry per row of the whole training set.
tsne_labels = lb.transform(train_df['Congestion_Type'].values[:len(tsne)])
trace = go.Scatter3d(
    x = tsne[:, 0],
    y = tsne[:, 1],
    z = tsne[:, 2],
    mode = 'markers',
    marker = dict(
        size = 12,
        color = tsne_labels,
        colorscale = 'Portland',
        colorbar = dict(title = 'Congestion'),
        line = dict(color='rgb(255, 255, 255)'),
        opacity = 0.5
    )
)

layout = go.Layout(
    scene = dict(
        camera = dict(
            eye = dict(
            x = 0.5,
            y = 0.5,
            z = 0.5
            )
        )
    ),
    margin = dict(
        l = 0,
        r = 0,
        b = 0,
        t = 0
    )
)
fig = go.Figure(data = [trace], layout = layout)
iplot(fig, filename = '3d-scatter')

In [None]:
plot_df = pd.DataFrame(dict(x=tsne[:,0], y=tsne[:,1], color=lb.transform(train_df.loc[:9999, target_name])))
# x / y are passed by keyword: newer seaborn releases no longer accept them positionally.
sns.lmplot(x = 'x', y = 'y', data=plot_df, hue='color', fit_reg=False, scatter_kws = {'alpha' : 0.3})
plt.show()

In [None]:
tsne = TSNE(
    n_components = 2,
    init = 'random',      
    random_state = 101,
    method = 'barnes_hut',
    n_iter = 300,
    verbose = 2,
    angle = 0.5
).fit_transform(tmp_copy[:10000].values)

In [None]:
# t-SNE was fitted on a leading slice of the rows only, so encode the labels for exactly
# those rows; previously z/color held one entry per row of the whole training set.
tsne_labels = lb.transform(train_df['Congestion_Type'].values[:len(tsne)])
trace = go.Scatter3d(
    x = tsne[:, 0],
    y = tsne[:, 1],
    z = tsne_labels,   # encoded congestion class used as the third axis
    mode = 'markers',
    marker = dict(
        size = 12,
        color = tsne_labels,
        colorscale = 'Portland',
        colorbar = dict(title = 'Congestion'),
        line = dict(color='rgb(255, 255, 255)'),
        opacity = 0.5
    )
)

layout = go.Layout(
    scene = dict(
        camera = dict(
            eye = dict(
            x = 0.5,
            y = 0.5,
            z = 0.5
            )
        )
    ),
    margin = dict(
        l = 0,
        r = 0,
        b = 0,
        t = 0
    )
)
fig = go.Figure(data = [trace], layout = layout)
iplot(fig, filename = '3d-scatter')

In [None]:
plot_df = pd.DataFrame(dict(x=tsne[:,0], y=tsne[:,1], color=lb.transform(train_df.loc[:9999, target_name])))
# x / y are passed by keyword: newer seaborn releases no longer accept them positionally.
sns.lmplot(x = 'x', y = 'y', data=plot_df, hue='color', fit_reg=False, scatter_kws = {'alpha' : 0.3})
plt.show()

In [None]:
import umap

In [None]:
reduce = umap.UMAP(random_state = 223, n_components = 3)  #just for reproducibility
embeddings = reduce.fit_transform(tmp_copy.values)

In [None]:
trace = go.Scatter3d(
    x = embeddings[:, 0],
    y = embeddings[:, 1],
    z = embeddings[:, 2],
    mode = 'markers',
    marker = dict(
        size = 12,
        color = lb.transform(train_df['Congestion_Type'].values),
        colorscale = 'Portland',
        colorbar = dict(title = 'Congestion'),
        line = dict(color='rgb(255, 255, 255)'),
        opacity = 0.5
    )
)

layout = go.Layout(
    scene = dict(
        camera = dict(
            eye = dict(
            x = 0.5,
            y = 0.5,
            z = 0.5
            )
        )
    ),
    margin = dict(
        l = 0,
        r = 0,
        b = 0,
        t = 0
    )
)
fig = go.Figure(data = [trace], layout = layout)
iplot(fig, filename = '3d-scatter-umap')

In [None]:
reduce = umap.UMAP(random_state = 446, n_components = 2)  #just for reproducibility
embeddings = reduce.fit_transform(tmp_copy.values)

In [None]:
plot_df = pd.DataFrame(embeddings, columns = ['x', 'y'])
plot_df[target_name] = train_df.loc[:, target_name]

In [None]:
# fig = plt.Figure(figsize = (15, 15))
ax = sns.pairplot(x_vars = ['x'], y_vars = ['y'], data = plot_df, hue = target_name, kind = 'scatter', size = 11, plot_kws = {'s' : 80, 'alpha' : 0.6})
ax.fig.suptitle('Embedding clustered with UMAP');

In [None]:
trace = go.Scatter3d(
    x = embeddings[:, 0],
    y = embeddings[:, 1],
    z = lb.transform(train_df['Congestion_Type'].values),
    mode = 'markers',
    marker = dict(
        size = 12,
        color = lb.transform(train_df['Congestion_Type'].values),
        colorscale = 'Portland',
        colorbar = dict(title = 'Congestion'),
        line = dict(color='rgb(255, 255, 255)'),
        opacity = 0.5
    )
)

layout = go.Layout(
    scene = dict(
        camera = dict(
            eye = dict(
            x = 0.5,
            y = 0.5,
            z = 0.5
            )
        )
    ),
    margin = dict(
        l = 0,
        r = 0,
        b = 0,
        t = 0
    )
)
fig = go.Figure(data = [trace], layout = layout)
iplot(fig, filename = '3d-scatter-umap-with-label')

## Using Date data

In [None]:
df = train_df.copy(deep = True)

In [None]:
from datetime import datetime, date

In [None]:
df.insert(7, 'day', np.nan)
df.head()

## monday -> 0, sunday -> 6

In [None]:
# Weekday (Monday = 0 ... Sunday = 6) of each record. pd.to_datetime assembles dates from
# columns named year/month/day in one vectorized pass, replacing a per-row strptime call.
df['day'] = pd.to_datetime(
    df[['par_year', 'par_month', 'par_day']].rename(
        columns = {'par_year' : 'year', 'par_month' : 'month', 'par_day' : 'day'}
    )
).dt.weekday

In [None]:
df.head()

In [None]:
byte_cols = df.columns[9:35]

In [None]:
# Per-row total over all byte-counter columns. DataFrame.sum is vectorized, unlike
# apply(sum, axis=1), which calls Python's sum once per row.
df['total_bytes'] = df.loc[:, byte_cols].sum(axis = 1)

In [None]:
fig = plt.figure(figsize = (10, 10))
ax = sns.boxplot(x = 'day', y = 'total_bytes', hue = target_name, data = df)#, order = df[].value_counts().sort_values(ascending = False).index)
# ax.set_xticklabels(df[target_name].value_counts().sort_values(ascending = False).index, rotation = 90);

In [None]:
fig = plt.figure(figsize = (6, 6))
ax = sns.barplot(x = 'day', y = 'total_bytes', data = df)
# barplot heights are the mean total_bytes per weekday, not row counts, so label each bar
# with that value. The old "height * 100 / number of rows" percentage was carried over
# from the countplot cells and had no meaning here.
for p in ax.patches:
    ax.annotate('{:.2f}'.format(p.get_height()), (p.get_x() + 0.1, p.get_height() + 15))

## Removing date info and forming clusters again

In [None]:
tmp_drop_date = tmp.copy(deep = True)

In [None]:
tmp_drop_date.drop(columns = ['par_day', 'par_hour', 'par_min'], inplace = True)

In [None]:
tsne = TSNE(
    n_components = 2,
    init = 'random',      
    random_state = 101,
    method = 'barnes_hut',
    n_iter = 300,
    verbose = 2,
    angle = 0.5
).fit_transform(tmp_drop_date[:10000].values)

In [None]:
# t-SNE was fitted on a leading slice of the rows only, so encode the labels for exactly
# those rows; previously z/color held one entry per row of the whole training set.
tsne_labels = lb.transform(train_df['Congestion_Type'].values[:len(tsne)])
trace = go.Scatter3d(
    x = tsne[:, 0],
    y = tsne[:, 1],
    z = tsne_labels,   # encoded congestion class used as the third axis
    mode = 'markers',
    marker = dict(
        size = 12,
        color = tsne_labels,
        colorscale = 'Portland',
        colorbar = dict(title = 'Congestion'),
        line = dict(color='rgb(255, 255, 255)'),
        opacity = 0.5
    )
)

layout = go.Layout(
    scene = dict(
        camera = dict(
            eye = dict(
            x = 0.5,
            y = 0.5,
            z = 0.5
            )
        )
    ),
    margin = dict(
        l = 0,
        r = 0,
        b = 0,
        t = 0
    )
)
fig = go.Figure(data = [trace], layout = layout)
iplot(fig, filename = '3d-scatter')

In [None]:
plot_df = pd.DataFrame(dict(x=tsne[:,0], y=tsne[:,1], color=lb.transform(train_df.loc[:9999, target_name])))
# x / y are passed by keyword: newer seaborn releases no longer accept them positionally.
sns.lmplot(x = 'x', y = 'y', data=plot_df, hue='color', fit_reg=False, scatter_kws = {'alpha' : 0.3})
plt.show()

In [None]:
tsne = TSNE(
    n_components = 3,
    init = 'random', # pca
    random_state = 101,
    method = 'barnes_hut',
    n_iter = 300,
    verbose = 2,
    angle = 0.5
).fit_transform(tmp_drop_date[:10000].values)

In [None]:
# t-SNE was fitted on a leading slice of the rows only, so encode the labels for exactly
# those rows; previously color held one entry per row of the whole training set.
tsne_labels = lb.transform(train_df['Congestion_Type'].values[:len(tsne)])
trace = go.Scatter3d(
    x = tsne[:, 0],
    y = tsne[:, 1],
    z = tsne[:, 2],
    mode = 'markers',
    marker = dict(
        size = 12,
        color = tsne_labels,
        colorscale = 'Portland',
        colorbar = dict(title = 'Congestion'),
        line = dict(color='rgb(255, 255, 255)'),
        opacity = 0.5
    )
)

layout = go.Layout(
    scene = dict(
        camera = dict(
            eye = dict(
            x = 0.5,
            y = 0.5,
            z = 0.5
            )
        )
    ),
    margin = dict(
        l = 0,
        r = 0,
        b = 0,
        t = 0
    )
)
fig = go.Figure(data = [trace], layout = layout)
iplot(fig, filename = '3d-scatter')

In [None]:
plot_df = pd.DataFrame(dict(x=tsne[:,0], y=tsne[:,1], color=lb.transform(train_df.loc[:9999, target_name])))
# x / y are passed by keyword: newer seaborn releases no longer accept them positionally.
sns.lmplot(x = 'x', y = 'y', data=plot_df, hue='color', fit_reg=False, scatter_kws = {'alpha' : 0.3})
plt.show()

In [None]:
import umap

In [None]:
reduce = umap.UMAP(random_state = 223, n_components = 3)  #just for reproducibility
embeddings = reduce.fit_transform(tmp_drop_date.values)

In [None]:
trace = go.Scatter3d(
    x = embeddings[:, 0],
    y = embeddings[:, 1],
    z = embeddings[:, 2],
    mode = 'markers',
    marker = dict(
        size = 12,
        color = lb.transform(train_df['Congestion_Type'].values),
        colorscale = 'Portland',
        colorbar = dict(title = 'Congestion'),
        line = dict(color='rgb(255, 255, 255)'),
        opacity = 0.5
    )
)

layout = go.Layout(
    scene = dict(
        camera = dict(
            eye = dict(
            x = 0.5,
            y = 0.5,
            z = 0.5
            )
        )
    ),
    margin = dict(
        l = 0,
        r = 0,
        b = 0,
        t = 0
    )
)
fig = go.Figure(data = [trace], layout = layout)
iplot(fig, filename = '3d-scatter-umap')

In [None]:
reduce = umap.UMAP(random_state = 446, n_components = 2)  #just for reproducibility
embeddings = reduce.fit_transform(tmp_drop_date.values)

In [None]:
plot_df = pd.DataFrame(embeddings, columns = ['x', 'y'])
plot_df[target_name] = train_df.loc[:, target_name]

In [None]:
# fig = plt.Figure(figsize = (15, 15))
ax = sns.pairplot(x_vars = ['x'], y_vars = ['y'], data = plot_df, hue = target_name, kind = 'scatter', size = 11, plot_kws = {'s' : 80, 'alpha' : 0.6})
ax.fig.suptitle('Embedding clustered with UMAP');

In [None]:
trace = go.Scatter3d(
    x = embeddings[:, 0],
    y = embeddings[:, 1],
    z = lb.transform(train_df['Congestion_Type'].values),
    mode = 'markers',
    marker = dict(
        size = 12,
        color = lb.transform(train_df['Congestion_Type'].values),
        colorscale = 'Portland',
        colorbar = dict(title = 'Congestion'),
        line = dict(color='rgb(255, 255, 255)'),
        opacity = 0.5
    )
)

layout = go.Layout(
    scene = dict(
        camera = dict(
            eye = dict(
            x = 0.5,
            y = 0.5,
            z = 0.5
            )
        )
    ),
    margin = dict(
        l = 0,
        r = 0,
        b = 0,
        t = 0
    )
)
fig = go.Figure(data = [trace], layout = layout)
iplot(fig, filename = '3d-scatter-umap-with-label')

### Using HDBSCAN to get the no of clusters

In [None]:
reduce = umap.UMAP(random_state = 1333, n_components = 2)  #just for reproducibility
embeddings = reduce.fit_transform(tmp_drop_date.values)

In [None]:
plot_kwds = {'alpha' : 0.4, 's' : 80, 'linewidths':0}

In [None]:
def plot_clusters(data, algorithm, args, kwds):
    """Fit a clustering algorithm on 2-D points and scatter-plot the clusters.

    Parameters
    ----------
    data : array indexed as ``data[:, 0]`` / ``data[:, 1]``
        Points to cluster and plot.
    algorithm : class
        Clusterer class; it is instantiated as ``algorithm(*args, **kwds)`` and
        the fitted object must expose ``labels_`` and ``probabilities_``.
    args, kwds : tuple, dict
        Positional and keyword arguments for the clusterer constructor.

    Returns
    -------
    The fitted clusterer.

    Notes
    -----
    Scatter styling comes from the notebook-level ``plot_kwds`` dict.
    Points labelled ``-1`` (noise) are drawn in neutral grey; every other point
    gets its cluster colour, desaturated by its membership probability.
    The reported time covers the fit only.
    """
    # Local import: a later cell runs `from time import time`, which rebinds the
    # notebook-level name `time` to a function and would break `time.time()`.
    import time as _time

    start_time = _time.time()
    clusterer = algorithm(*args, **kwds).fit(data)
    end_time = _time.time()

    # Size the palette from the labels that are actually used as indices below,
    # and keep at least one colour so an all-noise result does not fail.
    labels = clusterer.labels_
    n_clusters = max(int(labels.max()) + 1, 1) if len(labels) else 1
    palette = sns.color_palette('deep', n_clusters)
    noise_colour = (0.5, 0.5, 0.5)
    colors = [
        sns.desaturate(palette[col], sat) if col >= 0 else noise_colour
        for col, sat in zip(labels, clusterer.probabilities_)
    ]

    plt.scatter(data[:, 0], data[:, 1], c=colors, **plot_kwds)
    frame = plt.gca()
    frame.get_xaxis().set_visible(False)
    frame.get_yaxis().set_visible(False)
    plt.title('Clusters found by {}'.format(str(algorithm.__name__)), fontsize=24)
    # Axes-fraction coordinates keep the note inside the plot for any data range.
    frame.text(0.02, 0.97, 'Clustering took {:.2f} s'.format(end_time - start_time),
               fontsize=14, va='top', transform=frame.transAxes)
    return clusterer

In [None]:
clusterer = plot_clusters(embeddings, hdbscan.HDBSCAN, (), {'prediction_data':True, 'min_cluster_size':15})

In [None]:
np.unique(clusterer.labels_)

In [None]:
clusterer = plot_clusters(tsne, hdbscan.HDBSCAN, (), {'prediction_data':True, 'min_cluster_size':15})

In [None]:
np.unique(clusterer.labels_)

### PCA, Cluster information, Date/No-date and seeing results
### Will work with `tmp` (keeps the date columns) and `tmp_drop_date` (date columns removed)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split,cross_val_predict,cross_val_score,cross_validate
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.linear_model import SGDClassifier,Perceptron,PassiveAggressiveClassifier,RidgeClassifier, LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import SelectFromModel,SelectKBest,chi2
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier,NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics

In [None]:
# for col in tmp.columns:
#     normalize_col(3, tmp, col)

# for col in tmp_drop_date.columns:
#     normalize_col(3, tmp_drop_date, col)

In [None]:
tmp.head()

### TASK ::
- Try not to scale categorical columns

In [None]:
# Columns to normalise: positions 4 up to (but excluding) the last four, plus the final column.
# NOTE(review): with the column layout printed further down for a copy of `tmp`, this spans
# subscriber_count .. beam_direction plus total_bytes — confirm if the column order changes.
tmp_cols = tmp.columns[4 : -4].values.tolist() + [tmp.columns[-1]]

In [None]:
tmp_cols

In [None]:
tmp_drop_date.head()

In [None]:
# Same selection for the frame without date columns: positions 1 up to (but excluding)
# the last four, plus the final column. The start offset is 1 instead of 4 because the
# three par_* columns are absent here — TODO confirm against tmp_drop_date.columns.
tmp_date_cols = tmp_drop_date.columns[1 : -4].values.tolist() + [tmp_drop_date.columns[-1]]

In [None]:
tmp_date_cols

In [None]:
assert(tmp_cols == tmp_date_cols)

In [None]:
for col in tmp_cols:
    normalize_col(3, tmp, col)

for col in tmp_date_cols:
    normalize_col(3, tmp_drop_date, col)

In [None]:
X = tmp.values
X_date = tmp_drop_date.values

In [None]:
y = lb.transform(train_df['Congestion_Type'])
y

In [None]:
# Split both feature matrices and the labels in ONE call so they share the same row
# partition. Two separate calls with different seeds made the second call overwrite
# y_train / y_valid, leaving x_train / x_valid paired with labels of other rows.
(x_train, x_valid,
 x_date_train, x_date_valid,
 y_train, y_valid) = train_test_split(X, X_date, y, test_size = 0.2, random_state = 4242, stratify = y)

In [None]:
target_names = lb.classes_
target_names

In [None]:
from time import time

In [None]:
def benchmark(clf, name = None, x_train = x_train, x_valid = x_valid, y_train = None, y_valid = None):
    """Fit ``clf``, print train/validation metrics and plot the validation confusion matrix.

    Parameters
    ----------
    clf : estimator exposing ``fit`` and ``predict``.
    name : str, optional
        Label used in the figure caption. Defaults to the estimator description
        (the text of ``str(clf)`` before the first ``'('``).
    x_train, x_valid : feature matrices
        Defaults are the notebook-level ``x_train`` / ``x_valid`` as they were
        when this cell was executed (default arguments are bound at definition).
    y_train, y_valid : labels for the rows of ``x_train`` / ``x_valid``, optional
        When omitted, the notebook-level ``y_train`` / ``y_valid`` are looked up
        at call time, which is what the function always did before these
        parameters existed. Pass them explicitly whenever the features come
        from a different split than the current globals.

    Returns
    -------
    tuple
        ``(clf_descr, score, train_time, test_time)`` where ``score`` is the
        validation accuracy and the times are in seconds.

    Notes
    -----
    Class names for the reports and the heatmap come from the notebook-level
    ``target_names``.
    """
    # The parameters shadow the notebook-level names, so go through globals().
    if y_train is None:
        y_train = globals()['y_train']
    if y_valid is None:
        y_valid = globals()['y_valid']

    clf_descr = str(clf).split('(')[0]
    if name is None:
        # Avoids a TypeError when the caption is built below.
        name = clf_descr

    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(x_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(x_valid)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_valid, pred)

    train_pred = clf.predict(x_train)
    train_score = metrics.accuracy_score(y_train, train_pred)

    print("Train accuracy:   %0.3f" % train_score)
    print("Validation accuracy:   %0.3f" % score)

    print("classification report: Valid")
    print(metrics.classification_report(y_valid, pred,
                                        target_names=target_names))

    print("classification report: Train")
    print(metrics.classification_report(y_train, train_pred,
                                        target_names=target_names))

    print("confusion matrix:Valid")
    cm = metrics.confusion_matrix(y_valid, pred)
    print(cm)

    print("confusion matrix: Train")
    train_cm = metrics.confusion_matrix(y_train, train_pred)
    print(train_cm)

    # Heatmap of the validation confusion matrix with the accuracy as a caption.
    fig = plt.figure(figsize=(18,20))
    sns.heatmap(cm, annot=True, annot_kws={"size": 14},
                fmt='g', cmap='OrRd', xticklabels=target_names, yticklabels=target_names)
    txt = name + ' method has an accurcy of ' + str(100*score)
    fig.text(.5, .05, txt, ha='center',fontsize='xx-large')
    plt.tight_layout()
#     plt.savefig('./plots/' + name + '.jpg')

    print()
    return clf_descr, score, train_time, test_time

In [None]:
def trim(s):
    """Return ``s`` unchanged if it is at most 80 characters long; otherwise
    keep its first 77 characters and append "..." (80 characters in total)."""
    if len(s) > 80:
        return s[:77] + "..."
    return s

In [None]:
results = []
for clf, name in (
        (LogisticRegression(C = 2.0, tol = 1e-5, random_state = 420, solver = 'lbfgs', multi_class = 'multinomial', max_iter = 1000), "LR"),
#         (SVC(), "SVC"),
        (RidgeClassifier(tol=1e-2, solver="lsqr", max_iter=300), "Ridge Classifier"),
        (Perceptron(max_iter=300, tol=1e-3, n_jobs=-1), "Perceptron"),
        (PassiveAggressiveClassifier(max_iter=2000, tol=1e-5, n_jobs=-1), "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=5, n_jobs=-1), "kNN"),
        (RandomForestClassifier(n_estimators=100, n_jobs = -1), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf,name,x_train, x_valid))

for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(benchmark(LinearSVC(penalty=penalty, dual=False,max_iter=3000,
                                       tol=1e-4),'LinearSVC',x_train, x_valid))

    # Train SGD model
    results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=2000, tol=1e-4,
                                           penalty=penalty),'SGDClassifer_'+penalty,x_train, x_valid))

# Train SGD with Elastic Net penalty
print('=' * 80)
print("Elastic-Net penalty")
results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=1000, tol=1e-4,
                                       penalty="elasticnet"),'SGDClassifer_elasticnet',x_train, x_valid))

print('=' * 80)
print("LinearSVC with L1-based feature selection")
# The smaller C, the stronger the regularization.
# The more regularization, the more sparsity.
results.append(benchmark(Pipeline([
  ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False,
                                                  tol=1e-4, ))),
  ('classification', LinearSVC(penalty="l2", class_weight = 'balanced'))]),'pipeline',x_train, x_valid))

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
results = []
for clf, name in (
        (LogisticRegression(C = 2.0, tol = 1e-5, random_state = 420, solver = 'lbfgs', multi_class = 'multinomial', max_iter = 1000), "LR_LBFGS"),
        (LogisticRegression(C = 2.0, tol = 1e-5, random_state = 420, solver = 'saga', multi_class = 'multinomial', max_iter = 1000, penalty = 'l1'), "LR_SAGA"),
        (LogisticRegression(C = 2.0, tol = 1e-5, random_state = 420, solver = 'newton-cg', multi_class = 'multinomial', max_iter = 1000), "LR_NEWTON-CG"),
        (SVC(gamma = 'auto'), "SVC"),
        (GaussianNB(), "NB"),
        (DecisionTreeClassifier(random_state=0), "Decision"),
        (RandomForestClassifier(n_estimators=100,max_depth=2,random_state=0, n_jobs = -1), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf,name,x_date_train, x_date_valid))

## With PCA

In [None]:
from sklearn.decomposition import PCA

In [None]:
pca = PCA(n_components = 20)
X = pca.fit_transform(tmp.values)

In [None]:
pca.explained_variance_ratio_.sum()

In [None]:
pca = PCA(n_components = 20)
X_date = pca.fit_transform(tmp_drop_date.values)

In [None]:
pca.explained_variance_ratio_.sum()

In [None]:
X = np.concatenate((tmp.values, X), axis = 1)

In [None]:
X_date = np.concatenate((tmp_drop_date.values, X_date), axis = 1)

In [None]:
X.shape, X_date.shape, y.shape

In [None]:
# Split both feature matrices and the labels in ONE call so they share the same row
# partition. Two separate calls with different seeds made the second call overwrite
# y_train / y_valid, leaving x_train / x_valid paired with labels of other rows.
(x_train, x_valid,
 x_date_train, x_date_valid,
 y_train, y_valid) = train_test_split(X, X_date, y, test_size = 0.2, random_state = 4242, stratify = y)

In [None]:
target_names = lb.classes_
target_names

In [None]:
results = []
for clf, name in (
        (LogisticRegression(C = 2.0, tol = 1e-5, random_state = 420, solver = 'lbfgs', multi_class = 'multinomial', max_iter = 1000), "LR_LBFGS"),
        (LogisticRegression(C = 2.0, tol = 1e-5, random_state = 420, solver = 'saga', multi_class = 'multinomial', max_iter = 1000, penalty = 'l1'), "LR_SAGA"),
        (LogisticRegression(C = 2.0, tol = 1e-5, random_state = 420, solver = 'newton-cg', multi_class = 'multinomial', max_iter = 1000), "LR_NEWTON-CG"),
        (SVC(gamma = 'auto'), "SVC"),
        (GaussianNB(), "NB"),
        (DecisionTreeClassifier(random_state=0), "Decision"),
        (RandomForestClassifier(n_estimators=100,max_depth=2,random_state=0, n_jobs = -1), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf,name,x_date_train, x_date_valid))

### PCA ON BYTES COL ONLY

In [133]:
tmp_bytes = tmp.copy(deep = True)
tmp_drop_date_bytes = tmp_drop_date.copy(deep = True)

In [134]:
tmp_bytes.head()

Unnamed: 0,4G_rat,par_day,par_hour,par_min,subscriber_count,web_browsing_total_bytes,video_total_bytes,social_ntwrking_bytes,cloud_computing_total_bytes,web_security_total_bytes,gaming_total_bytes,health_total_bytes,communication_total_bytes,file_sharing_total_bytes,remote_access_total_bytes,photo_sharing_total_bytes,software_dwnld_total_bytes,marketplace_total_bytes,storage_services_total_bytes,audio_total_bytes,location_services_total_bytes,presence_total_bytes,advertisement_total_bytes,system_total_bytes,voip_total_bytes,speedtest_total_bytes,email_total_bytes,weather_total_bytes,media_total_bytes,mms_total_bytes,others_total_bytes,beam_direction,cell_range,tilt,ran_vendor,total_bytes
0,1,15,0,50,-0.070423,-0.200603,8.940787,0.089303,1.612595,1.301587,5.870968,0.578125,2.904192,-0.5,0.352941,-0.294118,0.617647,-0.448276,-0.33871,0.62069,0.403226,3.344599,0.935484,-0.241935,-0.241935,-0.322581,-0.322581,0.047619,0.262295,0.145161,1.177419,-0.7,6,2,0,1.964124
1,1,7,20,5,-0.267606,0.053065,-0.208709,0.435507,-0.240458,-0.031746,-0.274194,-0.3125,-0.233234,-0.5,-0.294118,-0.147059,-0.117647,-0.310345,-0.209677,0.275862,-0.451613,-0.202329,0.258065,-0.387097,-0.193548,-0.354839,-0.129032,-0.15873,-0.377049,-0.322581,-0.145161,0.1,7,5,2,-0.396529
2,1,5,16,35,0.181087,-0.107352,-0.113437,3.375029,-0.034351,-0.206349,-0.403226,0.328125,0.869461,-0.375,-0.117647,0.5,0.441176,-0.068966,0.241935,-0.206897,-0.435484,4.524473,-0.354839,0.16129,1.854839,-0.112903,-0.258065,0.968254,0.311475,1.887097,2.080645,-0.433333,6,3,0,0.765574
3,0,16,21,60,-0.273642,1.146316,-0.18376,0.959401,-0.234733,0.460317,-0.193548,-0.265625,0.359281,0.5,0.0,2.323529,-0.294118,-0.448276,-0.258065,2.206897,0.16129,-0.228515,-0.290323,-0.370968,0.064516,-0.435484,-0.322581,0.52381,0.196721,0.725806,0.080645,0.166667,7,2,1,-0.020801
4,1,4,6,60,-0.140845,0.087789,-0.096129,0.850524,0.082061,0.444444,0.951613,-0.1875,-0.092515,0.375,4.647059,-0.147059,0.529412,1.896552,-0.129032,0.862069,13.048387,0.017812,1.370968,0.193548,3.903226,-0.419355,0.596774,0.460317,1.0,2.693548,4.612903,0.733333,3,5,1,-0.18154


In [135]:
tmp_cols_bytes = tmp_bytes.columns[5 : -5].values.tolist() + [tmp_bytes.columns[-1]]
tmp_cols_bytes

['web_browsing_total_bytes',
 'video_total_bytes',
 'social_ntwrking_bytes',
 'cloud_computing_total_bytes',
 'web_security_total_bytes',
 'gaming_total_bytes',
 'health_total_bytes',
 'communication_total_bytes',
 'file_sharing_total_bytes',
 'remote_access_total_bytes',
 'photo_sharing_total_bytes',
 'software_dwnld_total_bytes',
 'marketplace_total_bytes',
 'storage_services_total_bytes',
 'audio_total_bytes',
 'location_services_total_bytes',
 'presence_total_bytes',
 'advertisement_total_bytes',
 'system_total_bytes',
 'voip_total_bytes',
 'speedtest_total_bytes',
 'email_total_bytes',
 'weather_total_bytes',
 'media_total_bytes',
 'mms_total_bytes',
 'others_total_bytes',
 'total_bytes']

In [136]:
tmp_drop_date_bytes.head()

Unnamed: 0,4G_rat,subscriber_count,web_browsing_total_bytes,video_total_bytes,social_ntwrking_bytes,cloud_computing_total_bytes,web_security_total_bytes,gaming_total_bytes,health_total_bytes,communication_total_bytes,file_sharing_total_bytes,remote_access_total_bytes,photo_sharing_total_bytes,software_dwnld_total_bytes,marketplace_total_bytes,storage_services_total_bytes,audio_total_bytes,location_services_total_bytes,presence_total_bytes,advertisement_total_bytes,system_total_bytes,voip_total_bytes,speedtest_total_bytes,email_total_bytes,weather_total_bytes,media_total_bytes,mms_total_bytes,others_total_bytes,beam_direction,cell_range,tilt,ran_vendor,total_bytes
0,1,-0.070423,-0.200603,8.940787,0.089303,1.612595,1.301587,5.870968,0.578125,2.904192,-0.5,0.352941,-0.294118,0.617647,-0.448276,-0.33871,0.62069,0.403226,3.344599,0.935484,-0.241935,-0.241935,-0.322581,-0.322581,0.047619,0.262295,0.145161,1.177419,-0.7,6,2,0,1.964124
1,1,-0.267606,0.053065,-0.208709,0.435507,-0.240458,-0.031746,-0.274194,-0.3125,-0.233234,-0.5,-0.294118,-0.147059,-0.117647,-0.310345,-0.209677,0.275862,-0.451613,-0.202329,0.258065,-0.387097,-0.193548,-0.354839,-0.129032,-0.15873,-0.377049,-0.322581,-0.145161,0.1,7,5,2,-0.396529
2,1,0.181087,-0.107352,-0.113437,3.375029,-0.034351,-0.206349,-0.403226,0.328125,0.869461,-0.375,-0.117647,0.5,0.441176,-0.068966,0.241935,-0.206897,-0.435484,4.524473,-0.354839,0.16129,1.854839,-0.112903,-0.258065,0.968254,0.311475,1.887097,2.080645,-0.433333,6,3,0,0.765574
3,0,-0.273642,1.146316,-0.18376,0.959401,-0.234733,0.460317,-0.193548,-0.265625,0.359281,0.5,0.0,2.323529,-0.294118,-0.448276,-0.258065,2.206897,0.16129,-0.228515,-0.290323,-0.370968,0.064516,-0.435484,-0.322581,0.52381,0.196721,0.725806,0.080645,0.166667,7,2,1,-0.020801
4,1,-0.140845,0.087789,-0.096129,0.850524,0.082061,0.444444,0.951613,-0.1875,-0.092515,0.375,4.647059,-0.147059,0.529412,1.896552,-0.129032,0.862069,13.048387,0.017812,1.370968,0.193548,3.903226,-0.419355,0.596774,0.460317,1.0,2.693548,4.612903,0.733333,3,5,1,-0.18154


In [137]:
tmp_drop_date_cols_bytes = tmp_drop_date_bytes.columns[2 : -5].values.tolist() + [tmp_drop_date_bytes.columns[-1]]
tmp_drop_date_cols_bytes

['web_browsing_total_bytes',
 'video_total_bytes',
 'social_ntwrking_bytes',
 'cloud_computing_total_bytes',
 'web_security_total_bytes',
 'gaming_total_bytes',
 'health_total_bytes',
 'communication_total_bytes',
 'file_sharing_total_bytes',
 'remote_access_total_bytes',
 'photo_sharing_total_bytes',
 'software_dwnld_total_bytes',
 'marketplace_total_bytes',
 'storage_services_total_bytes',
 'audio_total_bytes',
 'location_services_total_bytes',
 'presence_total_bytes',
 'advertisement_total_bytes',
 'system_total_bytes',
 'voip_total_bytes',
 'speedtest_total_bytes',
 'email_total_bytes',
 'weather_total_bytes',
 'media_total_bytes',
 'mms_total_bytes',
 'others_total_bytes',
 'total_bytes']

In [138]:
pca = PCA(n_components = 20)
X_tmp_bytes = pca.fit_transform(tmp_bytes.loc[:, tmp_cols_bytes].values)

In [139]:
pca.explained_variance_ratio_.sum()

0.8778890662735208

In [140]:
pca = PCA(n_components = 20)
X_tmp_date_bytes = pca.fit_transform(tmp_drop_date_bytes.loc[:, tmp_drop_date_cols_bytes].values)

In [141]:
pca.explained_variance_ratio_.sum()

0.8778890662735216

In [142]:
tmp_bytes.drop(columns = tmp_cols_bytes, inplace = True)
tmp_drop_date_bytes.drop(columns = tmp_drop_date_cols_bytes, inplace = True)

In [143]:
tmp_bytes.head()

Unnamed: 0,4G_rat,par_day,par_hour,par_min,subscriber_count,beam_direction,cell_range,tilt,ran_vendor
0,1,15,0,50,-0.070423,-0.7,6,2,0
1,1,7,20,5,-0.267606,0.1,7,5,2
2,1,5,16,35,0.181087,-0.433333,6,3,0
3,0,16,21,60,-0.273642,0.166667,7,2,1
4,1,4,6,60,-0.140845,0.733333,3,5,1


In [144]:
tmp_drop_date_bytes.head()

Unnamed: 0,4G_rat,subscriber_count,beam_direction,cell_range,tilt,ran_vendor
0,1,-0.070423,-0.7,6,2,0
1,1,-0.267606,0.1,7,5,2
2,1,0.181087,-0.433333,6,3,0
3,0,-0.273642,0.166667,7,2,1
4,1,-0.140845,0.733333,3,5,1


In [145]:
X = np.concatenate((tmp_bytes.values, X_tmp_bytes), axis = 1)

In [146]:
X_date = np.concatenate((tmp_drop_date_bytes.values, X_tmp_date_bytes), axis = 1)

In [147]:
y = lb.transform(train_df['Congestion_Type'])
y

array([1, 3, 0, ..., 1, 0, 3])

In [148]:
X.shape, X_date.shape, y.shape

((78560, 29), (78560, 26), (78560,))

In [149]:
# Split both feature matrices and the labels in ONE call so they share the same row
# partition. Two separate calls with different seeds made the second call overwrite
# y_train / y_valid, leaving x_train / x_valid paired with labels of other rows.
(x_train, x_valid,
 x_date_train, x_date_valid,
 y_train, y_valid) = train_test_split(X, X_date, y, test_size = 0.2, random_state = 4242, stratify = y)

In [150]:
target_names = lb.classes_
target_names

array(['3G_BACKHAUL_CONGESTION', '4G_BACKHAUL_CONGESTION',
       '4G_RAN_CONGESTION', 'NC'], dtype=object)

In [None]:
results = []
for clf, name in (
        (LogisticRegression(C = 2.0, tol = 1e-5, random_state = 420, solver = 'lbfgs', multi_class = 'multinomial', max_iter = 1000), "LR_LBFGS"),
        (LogisticRegression(C = 2.0, tol = 1e-5, random_state = 420, solver = 'saga', multi_class = 'multinomial', max_iter = 1000, penalty = 'l1'), "LR_SAGA"),
        (LogisticRegression(C = 2.0, tol = 1e-5, random_state = 420, solver = 'newton-cg', multi_class = 'multinomial', max_iter = 1000), "LR_NEWTON-CG"),
        (SVC(gamma = 'auto'), "SVC"),
        (GaussianNB(), "NB"),
        (DecisionTreeClassifier(random_state=0), "Decision"),
        (RandomForestClassifier(n_estimators=100,max_depth=2,random_state=0, n_jobs = -1), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf,name,x_date_train, x_date_valid))


In [None]:
results = []
for clf, name in (
        (LogisticRegression(C = 2.0, tol = 1e-5, random_state = 420, solver = 'lbfgs', multi_class = 'multinomial', max_iter = 1000), "LR_LBFGS"),
        (LogisticRegression(C = 2.0, tol = 1e-5, random_state = 420, solver = 'saga', multi_class = 'multinomial', max_iter = 1000, penalty = 'l1'), "LR_SAGA"),
        (LogisticRegression(C = 2.0, tol = 1e-5, random_state = 420, solver = 'newton-cg', multi_class = 'multinomial', max_iter = 1000), "LR_NEWTON-CG"),
        (SVC(gamma = 'auto'), "SVC"),
        (RidgeClassifier(tol=1e-2, solver="lsqr", max_iter=300), "Ridge Classifier"),
        (Perceptron(max_iter=300, tol=1e-3, n_jobs=-1), "Perceptron"),
        (PassiveAggressiveClassifier(max_iter=2000, tol=1e-5, n_jobs=-1), "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=5, n_jobs=-1), "kNN"),
        (RandomForestClassifier(n_estimators=100, n_jobs = -1), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf,name,x_date_train, x_date_valid))

for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(benchmark(LinearSVC(penalty=penalty, dual=False,max_iter=3000,
                                       tol=1e-4),'LinearSVC',x_date_train, x_date_valid))

    # Train SGD model
    results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=2000, tol=1e-4,
                                           penalty=penalty),'SGDClassifer_'+penalty,x_date_train, x_date_valid))

# Train SGD with Elastic Net penalty
print('=' * 80)
print("Elastic-Net penalty")
results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=1000, tol=1e-4,
                                       penalty="elasticnet"),'SGDClassifer_elasticnet',x_date_train, x_date_valid))

print('=' * 80)
print("LinearSVC with L1-based feature selection")
# The smaller C, the stronger the regularization.
# The more regularization, the more sparsity.
results.append(benchmark(Pipeline([
  ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False, max_iter = 3000,
                                                  tol=1e-4, ))),
  ('classification', LinearSVC(penalty="l2", class_weight = 'balanced'))]),'pipeline',x_date_train, x_date_valid))

## Breaking the label into 3 parts

In [None]:
byte_cols = train_df.columns[8:34]

In [None]:
# Row-wise total over the byte columns; vectorised sum instead of a per-row Python call.
total_bytes = train_df.loc[:, byte_cols].sum(axis = 1)

In [None]:
tmp = train_df.copy(deep = True)

In [None]:
tmp['total_bytes'] = total_bytes

In [None]:
# Row-wise total over the byte columns; vectorised sum instead of a per-row Python call.
total_bytes = tmp1.loc[:, byte_cols].sum(axis = 1)

In [None]:
tmp1['total_bytes'] = total_bytes

In [None]:
# Distribution of total_bytes with a fitted normal curve overlaid.
sns.distplot(tmp['total_bytes'].values, fit = norm)

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(tmp['total_bytes'].values)
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

#Now plot the distribution
# Raw string: "\m" and "\s" are invalid escape sequences in a regular literal.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc = 'best')
plt.ylabel('Frequency')
# The plotted column is total_bytes, not the subscriber count.
plt.title('Total bytes');

In [None]:
# Distribution of total_bytes with a fitted normal curve overlaid.
sns.distplot(tmp1['total_bytes'].values, fit = norm)

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(tmp1['total_bytes'].values)
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

#Now plot the distribution
# Raw string: "\m" and "\s" are invalid escape sequences in a regular literal.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc = 'best')
plt.ylabel('Frequency')
# The plotted column is total_bytes, not the subscriber count.
plt.title('Total bytes');

In [None]:
print("Skewness: %f" % tmp['total_bytes'].skew())
print("Kurtosis: %f" % tmp['total_bytes'].kurt())

In [None]:
print("Skewness: %f" % tmp['subscriber_count'].skew())
print("Kurtosis: %f" % tmp['subscriber_count'].kurt())

In [None]:
print("Skewness: %f" % tmp1['total_bytes'].skew())
print("Kurtosis: %f" % tmp1['total_bytes'].kurt())

In [None]:
print("Skewness: %f" % tmp1['subscriber_count'].skew())
print("Kurtosis: %f" % tmp1['subscriber_count'].kurt())

In [None]:
ax = sns.boxplot(x = target_name, y = 'subscriber_count', data = tmp, order = tmp[target_name].value_counts().sort_values(ascending = False).index)
ax.set_xticklabels(tmp[target_name].value_counts().sort_values(ascending = False).index, rotation = 90);

In [None]:
tmp[tmp.loc[:, target_name] == 'NC']['subscriber_count'].max()

In [None]:
tmp[tmp.loc[:, target_name] == '4G_BACKHAUL_CONGESTION']['subscriber_count'].max()

In [None]:
x = tmp[tmp.loc[:, target_name] == 'NC']
x[x.loc[:, 'subscriber_count'] < 4000]['total_bytes'].max()

In [None]:
x = tmp[tmp.loc[:, target_name] == 'NC']
x[x.loc[:, 'total_bytes'] < 12000]['subscriber_count'].max()

In [None]:
ax = sns.boxplot(x = target_name, y = 'subscriber_count', data = tmp1, order = tmp1[target_name].value_counts().sort_values(ascending = False).index)
ax.set_xticklabels(tmp1[target_name].value_counts().sort_values(ascending = False).index, rotation = 90);

In [None]:
tmp1[tmp1.loc[:, target_name] == 'NC']['subscriber_count'].max()

In [None]:
tmp1[tmp1.loc[:, target_name] == '4G_BACKHAUL_CONGESTION']['subscriber_count'].max()

In [None]:
# Name of the column used on the y-axis of the box plots and max() look-ups that follow.
# NOTE(review): this rebinds `y`, which earlier cells use for the encoded target labels
# passed to train_test_split; re-running one of those cells after this one would pick up
# the string instead.
y = 'total_bytes'

In [None]:
ax = sns.boxplot(x = target_name, y = y, data = tmp, order = tmp[target_name].value_counts().sort_values(ascending = False).index)
ax.set_xticklabels(tmp[target_name].value_counts().sort_values(ascending = False).index, rotation = 90);

In [None]:
tmp[tmp.loc[:, target_name] == 'NC'][y].max()

In [None]:
tmp[tmp.loc[:, target_name] == '4G_BACKHAUL_CONGESTION'][y].max()

In [None]:
ax = sns.boxplot(x = target_name, y = y, data = tmp1, order = tmp1[target_name].value_counts().sort_values(ascending = False).index)
ax.set_xticklabels(tmp1[target_name].value_counts().sort_values(ascending = False).index, rotation = 90);

In [None]:
tmp1[tmp1.loc[:, target_name] == 'NC'][y].max()

In [None]:
tmp1[tmp1.loc[:, target_name] == '4G_BACKHAUL_CONGESTION'][y].max()

### NOTE
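- SC (subscriber count) >= 4000, Total Bytes >= 12000 :: Congestion (3)
- NOTE: the cells below split on subscriber_count 3000 / total_bytes 110000 and count rows against 4000 / 120000, so these thresholds still need to be reconciled with the rule above.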

In [None]:
# Rows below BOTH thresholds. A boolean mask replaces the row-wise lambda apply, and
# .copy() makes all_df independent of tmp so later column assignments are unambiguous.
all_df = tmp[(tmp['subscriber_count'] < 3000) & (tmp['total_bytes'] < 110000)].copy()

In [None]:
all_df.__len__()

In [None]:
all_df.reset_index(drop = True, inplace = True)

In [None]:
len(tmp[tmp.loc[:, 'subscriber_count'] < 4000])

In [None]:
len(tmp[tmp.loc[:, 'total_bytes'] < 120000])

In [None]:
assert(all_df[all_df.loc[:, target_name] == 'NC'].__len__() == train_df[train_df.loc[:, target_name] == 'NC'].__len__())

In [None]:
# Rows at or above EITHER threshold (the complement of the all_df filter).
# A boolean mask replaces the row-wise lambda apply; .copy() detaches the result from tmp.
three_df = tmp[(tmp['subscriber_count'] >= 3000) | (tmp['total_bytes'] >= 110000)].copy()

In [None]:
# The original compared an expression with itself and could never fail.
# Intended check (assumed): three_df holds congested rows only, i.e. no 'NC' row
# reaches either threshold.
assert (three_df[target_name] != 'NC').all()

In [None]:
fig = plt.figure(figsize = (6, 6))
total = all_df.shape[0]
ax = sns.countplot(x = target_name, data = all_df, order = all_df[target_name].value_counts().sort_values(ascending = False).index)
ax.set_xticklabels(all_df[target_name].value_counts().sort_values(ascending = False).index, rotation = 90)
for p in ax.patches:
    ax.annotate('{:.2f}%'.format(p.get_height() * 100 / total), (p.get_x() + 0.1, p.get_height() + 15))

In [None]:
# Binary target: 0 for 'NC', 1 for any congestion type.
# assign() builds a new frame, so nothing is written into a filtered slice of tmp.
all_df = all_df.assign(labels = (all_df[target_name] != 'NC').astype(int))

In [None]:
fig = plt.figure(figsize = (6, 6))
total = all_df.shape[0]
ax = sns.countplot(x = 'labels', data = all_df, order = all_df['labels'].value_counts().sort_values(ascending = False).index)
for p in ax.patches:
    ax.annotate('{:.2f}%'.format(p.get_height() * 100 / total), (p.get_x() + 0.3, p.get_height() + 15))

In [None]:
bin_df = all_df.drop(columns = ['cell_name', 'par_year', 'par_month', 'par_day', 'par_hour', 'par_min', 'Congestion_Type'])
bin_df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
def label_encode(df, col):
    """Integer-encode one column of ``df`` in place.

    A fresh ``LabelEncoder`` is fitted on ``df[col]`` and the column is
    overwritten with the resulting codes.

    Parameters
    ----------
    df : DataFrame that is modified in place.
    col : name of the column to encode.

    Returns
    -------
    The fitted ``LabelEncoder`` (its ``classes_`` map codes back to the
    original values).
    """
    encoder = LabelEncoder()
    codes = encoder.fit_transform(df[col])
    df[col] = codes
    return encoder

In [None]:
# Integer-encode the vendor column in place; the returned encoder is not needed here.
label_encode(df = bin_df, col = 'ran_vendor');

In [None]:
# Normalize the feature columns only. 'labels' is the 0/1 target of the binary
# task and must stay untouched, as in the multi-class section, where the target
# is split off before the features are normalized.
# NOTE(review): `normalize_col` is defined earlier in the notebook; the meaning
# of its first argument (3) is not visible here.
for col in bin_df.columns.drop('labels'):
    normalize_col(3, bin_df, col)

In [None]:
# Split the binary frame into the 0/1 target (kept as a Series) and the
# feature matrix (plain NumPy array).
y = bin_df['labels']
X = bin_df.drop(columns = ['labels']).values

In [None]:
# 80/20 train/validation split, stratified on the binary label, with a fixed seed.
x_train, x_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size = 0.2,
    stratify = y,
    random_state = 4242,
)

In [None]:
# Human-readable class names for the binary task. The 'labels' column is 0 for
# 'NC' rows and 1 for all other rows, so index 0 -> 'NC', index 1 -> 'CONGESTION'.
# NOTE(review): presumably read by `benchmark` when reporting — confirm.
target_names = ['NC', 'CONGESTION']

In [None]:
# Benchmark a range of classifiers on the current train/validation split.
# `benchmark` is defined earlier in the notebook; whatever it returns for each
# model is collected in `results`.
#
# * Every estimator that draws random numbers while fitting (Perceptron,
#   Passive-Aggressive, Random forest, SGD, the dual LinearSVC at the end) gets
#   random_state = 420 -- the seed the LogisticRegression entries already use --
#   so that Restart & Run All reproduces the same scores.
# * The LinearSVC runs carry their penalty in the name, like the SGD runs, so
#   the L2 and L1 entries can be told apart in `results`.
results = []
for clf, name in (
        (LogisticRegression(C = 2.0, tol = 1e-5, random_state = 420, solver = 'lbfgs', multi_class = 'multinomial', max_iter = 1000, class_weight='balanced'), "LR_LBFGS"),
        (LogisticRegression(C = 2.0, tol = 1e-5, random_state = 420, solver = 'saga', multi_class = 'multinomial', max_iter = 1000, penalty = 'l1',class_weight='balanced'), "LR_SAGA"),
        (LogisticRegression(C = 2.0, tol = 1e-5, random_state = 420, solver = 'newton-cg', multi_class = 'multinomial', max_iter = 1000,class_weight='balanced'), "LR_NEWTON-CG"),
        (SVC(gamma = 'scale',class_weight='balanced'), "SVC"),
        (RidgeClassifier(tol=1e-2, solver="lsqr", max_iter=300,class_weight='balanced'), "Ridge Classifier"),
        (Perceptron(max_iter=300, tol=1e-3, n_jobs=-1,class_weight='balanced', random_state = 420), "Perceptron"),
        (PassiveAggressiveClassifier(max_iter=2000, tol=1e-5, n_jobs=-1, class_weight='balanced', random_state = 420), "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=5, n_jobs=-1, weights='distance'), "kNN"),
        (RandomForestClassifier(n_estimators=100, n_jobs = -1,class_weight='balanced', random_state = 420), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf,name,x_train, x_valid))

for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model (primal formulation, dual=False)
    results.append(benchmark(LinearSVC(penalty=penalty, dual=False,max_iter=3000,class_weight='balanced',
                                       tol=1e-4),'LinearSVC_'+penalty,x_train, x_valid))

    # Train SGD model
    results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=2000, tol=1e-4,class_weight='balanced',
                                           penalty=penalty, random_state = 420),'SGDClassifer_'+penalty,x_train, x_valid))

# Train SGD with Elastic Net penalty
print('=' * 80)
print("Elastic-Net penalty")
results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=1000, tol=1e-4,class_weight='balanced',
                                       penalty="elasticnet", random_state = 420),'SGDClassifer_elasticnet',x_train, x_valid))

print('=' * 80)
print("LinearSVC with L1-based feature selection")
# The smaller C, the stronger the regularization.
# The more regularization, the more sparsity.
# Step 1 keeps the features selected by an L1-penalised LinearSVC;
# step 2 fits an L2-penalised LinearSVC on the selected features.
results.append(benchmark(Pipeline([
  ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False, max_iter = 3000,class_weight='balanced',
                                                  tol=1e-4, ))),
  ('classification', LinearSVC(penalty="l2", class_weight = 'balanced', random_state = 420))]),'pipeline',x_train, x_valid))

In [None]:
# Frame for the multi-class task: remove the cell identifier and the par_*
# date/time bucket columns, and also the binary 'labels' column.
# 'labels' is computed directly from Congestion_Type (0 for 'NC', 1 otherwise),
# so leaving it among the features would leak the target into the model and
# inflate the validation scores.
mul_df = all_df.drop(
    columns = [
        'cell_name',
        'par_year', 'par_month', 'par_day', 'par_hour', 'par_min',
        'labels',
    ]
)

In [None]:
# Integer-encode the vendor column in place; the returned encoder is not needed here.
label_encode(df = mul_df, col = 'ran_vendor');

In [None]:
# Integer-encode the multi-class target in place and keep the fitted encoder:
# its classes_ give the original congestion-type names for each code.
lb = label_encode(df = mul_df, col = 'Congestion_Type')

In [None]:
# Separate the encoded target (NumPy array) from the features (still a
# DataFrame, so the columns can be normalized by name next).
y = mul_df['Congestion_Type'].values
X = mul_df.drop(columns = ['Congestion_Type'])

In [None]:
# Normalize every feature column of X through the notebook's `normalize_col`
# helper (defined earlier; the target was already split off above).
for col in X.columns.tolist():
    normalize_col(3, X, col)

In [None]:
# Convert the feature frame to a plain NumPy array for scikit-learn.
# np.asarray is a no-op when X is already an array, so re-running this cell
# does not fail the way `X.values` would on an ndarray.
X = np.asarray(X)

In [None]:
# 80/20 train/validation split, stratified on the encoded congestion type, with a fixed seed.
x_train, x_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size = 0.2,
    stratify = y,
    random_state = 4242,
)

In [None]:
# Class names for the multi-class task, taken from the encoder fitted on
# 'Congestion_Type': entry i is the original string behind encoded label i.
target_names = lb.classes_
# Display them as the cell output.
target_names

In [None]:
# Benchmark a range of classifiers on the current train/validation split.
# `benchmark` is defined earlier in the notebook; whatever it returns for each
# model is collected in `results`.
#
# * Every estimator that draws random numbers while fitting (Perceptron,
#   Passive-Aggressive, Random forest, SGD, the dual LinearSVC at the end) gets
#   random_state = 420 -- the seed the LogisticRegression entries already use --
#   so that Restart & Run All reproduces the same scores.
# * The LinearSVC runs carry their penalty in the name, like the SGD runs, so
#   the L2 and L1 entries can be told apart in `results`.
results = []
for clf, name in (
        (LogisticRegression(C = 2.0, tol = 1e-5, random_state = 420, solver = 'lbfgs', multi_class = 'multinomial', max_iter = 1000, class_weight='balanced'), "LR_LBFGS"),
        (LogisticRegression(C = 2.0, tol = 1e-5, random_state = 420, solver = 'saga', multi_class = 'multinomial', max_iter = 1000, penalty = 'l1',class_weight='balanced'), "LR_SAGA"),
        (LogisticRegression(C = 2.0, tol = 1e-5, random_state = 420, solver = 'newton-cg', multi_class = 'multinomial', max_iter = 1000,class_weight='balanced'), "LR_NEWTON-CG"),
        (SVC(gamma = 'scale',class_weight='balanced'), "SVC"),
        (RidgeClassifier(tol=1e-2, solver="lsqr", max_iter=300,class_weight='balanced'), "Ridge Classifier"),
        (Perceptron(max_iter=300, tol=1e-3, n_jobs=-1,class_weight='balanced', random_state = 420), "Perceptron"),
        (PassiveAggressiveClassifier(max_iter=2000, tol=1e-5, n_jobs=-1, class_weight='balanced', random_state = 420), "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=5, n_jobs=-1, weights='distance'), "kNN"),
        (RandomForestClassifier(n_estimators=100, n_jobs = -1,class_weight='balanced', random_state = 420), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf,name,x_train, x_valid))

for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model (primal formulation, dual=False)
    results.append(benchmark(LinearSVC(penalty=penalty, dual=False,max_iter=3000,class_weight='balanced',
                                       tol=1e-4),'LinearSVC_'+penalty,x_train, x_valid))

    # Train SGD model
    results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=2000, tol=1e-4,class_weight='balanced',
                                           penalty=penalty, random_state = 420),'SGDClassifer_'+penalty,x_train, x_valid))

# Train SGD with Elastic Net penalty
print('=' * 80)
print("Elastic-Net penalty")
results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=1000, tol=1e-4,class_weight='balanced',
                                       penalty="elasticnet", random_state = 420),'SGDClassifer_elasticnet',x_train, x_valid))

print('=' * 80)
print("LinearSVC with L1-based feature selection")
# The smaller C, the stronger the regularization.
# The more regularization, the more sparsity.
# Step 1 keeps the features selected by an L1-penalised LinearSVC;
# step 2 fits an L2-penalised LinearSVC on the selected features.
results.append(benchmark(Pipeline([
  ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False, max_iter = 3000,class_weight='balanced',
                                                  tol=1e-4, ))),
  ('classification', LinearSVC(penalty="l2", class_weight = 'balanced', random_state = 420))]),'pipeline',x_train, x_valid))

## Next step: add the predicted binary labels as a feature column and predict the 4 classes

In [None]:
# Class distribution of the target inside the high-usage slice; each bar is
# annotated with its share of the slice in percent.
fig = plt.figure(figsize = (6, 6))
total = three_df.shape[0]
class_order = three_df[target_name].value_counts().sort_values(ascending = False).index
ax = sns.countplot(x = target_name, data = three_df, order = class_order)
ax.set_xticklabels(class_order, rotation = 90)
for p in ax.patches:
    bar_height = p.get_height()
    ax.annotate('{:.2f}%'.format(bar_height * 100 / total), (p.get_x() + 0.1, bar_height + 15))

## Binary Classification

In [None]:
# no_cgs_df = train_df[train_df.loc[:, target_name] == 'NC']
# cgs_df = train_df[train_df.loc[:, target_name] != 'NC']