In [59]:
# import basic packages for managing dataframe
import pandas as pd
import numpy as np
from datetime import datetime

# import visualization packages
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker


# regardless warnings
import warnings 
import sys
if not sys.warnoptions:
    warnings.simplefilter("ignore")

# import model package 
# regression and classification
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC

#clustering
from sklearn.cluster import KMeans

# import measurement package
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score, f1_score
from sklearn import metrics


# PCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

#for handling imbalanced data 
import imblearn

# Run preproccesing pipeline or bring in data
#### ---> brought in training data from Microsoft Teams

In [60]:
#bring in data 
df_train = pd.read_csv('train.csv',engine='python')
df_test = pd.read_csv('test.csv',engine='python')

### remove those row with tags not included in top 200 tag list

In [61]:
def remove_no_tag(df):
    mask = (df.Tags=='[]') #create vector with 1 element for each row of the df (either a 1 or a 0)
    df_new = df[~mask] #give me all the rows where this mask conditiion does not occur
    return df_new

In [62]:
df_train_new = remove_no_tag(df_train)
df_test_new = remove_no_tag(df_test)

In [63]:
len(df_train_new)

1627

# Shape Y and X

In [64]:
columns_not_include = ['RECORDID','Tags','Tag_Count' 'Vec_Tags', 'Body', 'Body_Lemm',
       'Body_Lemm_lower', 'Body_Token','Body_Length','Title', 'Title_Lemm',
       'Title_Lemm_lower', 'Title_Token']

# split train and test
X_train = df_train_new.loc[:, ~df_train_new.columns.isin(columns_not_include)] #new bitwise mask 

X_test = df_test_new.loc[:, ~df_test_new.columns.isin(columns_not_include)]

In [65]:
X_train= X_train.iloc[:,2:]
X_test= X_test.iloc[:,2:]

In [9]:
#print(X_train)

# Run PCA to reduce dimensions

In [66]:
def trans_pca(df, variance):

    scaler = StandardScaler()
    # fit on training set only.
    scaler.fit(df)
    # apply transform to both the training set and the test set.
    df = scaler.transform(df)

    # variance = 0.9 indicates retaining 90% of total variance
    pca = PCA(variance)
    pca.fit(df)
    # print(pca.n_components_)
    df = pca.transform(df)

    return df

In [67]:
# Run PCA for the total dataset(Train+Test)

#X_total = pd.concat([X_train, X_test])
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
pca = PCA(0.9)
pca.fit(X_train)
X_train = pd.DataFrame(pca.transform(X_train))
X_test = pd.DataFrame(pca.transform(X_test))

# SVM + Tomek Links

In [77]:
type(X_train)

pandas.core.frame.DataFrame

In [68]:
#change from dataframe to numpy array to use in tomek links sampler
X_train_arr= np.array(X_train)
#print(X_train_arr)

X_test_arr= np.array(X_test)
#print(X_test_arr)

In [69]:
#accuracy_svm_tl = []
#f1_score_svm_tl = []
#roc_svm_tl = []

# compute how many 1 in each model
sum_train=[]
sum_test=[]

# Try first 10 models
for num_models in range(0,10): 
    # create train and test
    y_train = []
    y_test = []
    for i  in range(len(df_train_new)):
        y_train.append(int(df_train_new['Vec_Tags'].iloc[i].split(']')[0].split('[')[1].split(',')[num_models].rstrip(' ').lstrip(' ')))
    for j  in range(len(df_test_new)):
        y_test.append(int(df_test_new['Vec_Tags'].iloc[j].split(']')[0].split('[')[1].split(',')[num_models].rstrip(' ').lstrip(' ')))
    
    sum_train.append(sum(y_train))
    sum_test.append(sum(y_test))

In [70]:
y_train_arr= np.array(y_train)
y_test_arr= np.array(y_test)

### Tomek Links- Undersampling

In [13]:
#Tomek Links-- sampling_strategy= 'not minority'
from imblearn.under_sampling import TomekLinks

print('Before UnderSampling, the shape of train_X: {}'.format(X_train_arr.shape)) 
print('Before UnderSampling, the shape of train_y: {} \n'.format(y_train_arr.shape)) 

print("Before UnderSampling, counts of label '1': {}".format(sum(y_train_arr == 1))) 
print("Before UnderSampling, counts of label '0': {} \n".format(sum(y_train_arr == 0)))

tl= TomekLinks(sampling_strategy='not minority')
X_train_uns, y_train_uns = tl.fit_resample(X_train_arr, y_train_arr) #uns= denotes undersampled data

print('After UnderSampling, the shape of train_X: {}'.format(X_train_uns.shape)) 
print('After UnderSampling, the shape of train_y: {} \n'.format(y_train_uns.shape)) 
  
print("After UnderSampling, counts of label '1': {}".format(sum(y_train_uns == 1))) 
print("After UnderSampling, counts of label '0': {}".format(sum(y_train_uns == 0))) 

Before UnderSampling, the shape of train_X: (1627, 358)
Before UnderSampling, the shape of train_y: (1627,) 

Before UnderSampling, counts of label '1': 31
Before UnderSampling, counts of label '0': 1596 

After UnderSampling, the shape of train_X: (1627, 358)
After UnderSampling, the shape of train_y: (1627,) 

After UnderSampling, counts of label '1': 31
After UnderSampling, counts of label '0': 1596


In [72]:
#Tomek Links-- sampling_strategy= 'not majority'
from imblearn.under_sampling import TomekLinks

print('Before UnderSampling, the shape of train_X: {}'.format(X_train_arr.shape)) 
print('Before UnderSampling, the shape of train_y: {} \n'.format(y_train_arr.shape)) 

print("Before UnderSampling, counts of label '1': {}".format(sum(y_train_arr == 1))) 
print("Before UnderSampling, counts of label '0': {} \n".format(sum(y_train_arr == 0)))

tl= TomekLinks(sampling_strategy='not majority')
X_train_uns, y_train_uns = tl.fit_resample(X_train_arr, y_train_arr) #uns= denotes undersampled data

print('After UnderSampling, the shape of train_X: {}'.format(X_train_uns.shape)) 
print('After UnderSampling, the shape of train_y: {} \n'.format(y_train_uns.shape)) 
  
print("After UnderSampling, counts of label '1': {}".format(sum(y_train_uns == 1))) 
print("After UnderSampling, counts of label '0': {}".format(sum(y_train_uns == 0))) 

Before UnderSampling, the shape of train_X: (1627, 358)
Before UnderSampling, the shape of train_y: (1627,) 

Before UnderSampling, counts of label '1': 31
Before UnderSampling, counts of label '0': 1596 

After UnderSampling, the shape of train_X: (1627, 358)
After UnderSampling, the shape of train_y: (1627,) 

After UnderSampling, counts of label '1': 31
After UnderSampling, counts of label '0': 1596


##### Lmao it literally made no difference, even when 'not minority'  and 'not majority' sampling strategy is specified

# Linear SVM and SMOTE

In [135]:
# Recreate X_train and X_test again
X_train = df_train_new.loc[:, ~df_train_new.columns.isin(columns_not_include)]

X_test = df_test_new.loc[:, ~df_test_new.columns.isin(columns_not_include)]


X_train= X_train.iloc[:,2:]
X_test= X_test.iloc[:,2:]

In [136]:
# Run PCA for the total dataset(Train+Test)

X_train = X_train.fillna(0)
X_test = X_test.fillna(0)
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
pca = PCA(0.9)
pca.fit(X_train)
X_train = pd.DataFrame(pca.transform(X_train))
X_test = pd.DataFrame(pca.transform(X_test))

In [137]:
#change from dataframe to numpy array to use in smote
X_train_arr= np.array(X_train)
#print(X_train_arr)

X_test_arr= np.array(X_test)
#print(X_test_arr)

In [138]:
from imblearn.over_sampling import SMOTE 

accuracy_svm_sm = [] #sm- denotes using smote data set
f1_score_svm_sm = []
roc_svm_sm = []

# compute how many 1 in each model
sum_train=[]
sum_test=[]

# Try first 10 models
for num_models in range(0,10): 
    # create train and test
    y_train = []
    y_test = []
    for i  in range(len(df_train_new)):
        y_train.append(int(df_train_new['Vec_Tags'].iloc[i].split(']')[0].split('[')[1].split(',')[num_models].rstrip(' ').lstrip(' ')))
    for j  in range(len(df_test_new)):
        y_test.append(int(df_test_new['Vec_Tags'].iloc[j].split(']')[0].split('[')[1].split(',')[num_models].rstrip(' ').lstrip(' ')))
    
    sum_train.append(sum(y_train))
    sum_test.append(sum(y_test))
    
    y_train_arr= np.array(y_train)
    y_test_arr= np.array(y_test)
    
    #SMOTE
    sm = SMOTE(random_state = 2) 
    X_train_ovs, y_train_ovs = sm.fit_sample(X_train_arr, y_train_arr.ravel()) #ovs= denotes oversampled data
    
    #Linear SVM
    svm_sm = LinearSVC().fit(X_train_ovs, y_train_ovs)
    y_pred_svm_sm_test = svm_sm.predict(X_test)
    y_pred_svm_sm_train = svm_sm.predict(X_train_ovs)

    #y_score_svm = svm.predict_proba(X_test)[:,1]
    acc_svm_sm = accuracy_score(y_test, y_pred_svm_sm_test)
    f1_score_svm_sm_result = metrics.f1_score(y_test, y_pred_svm_sm_test)

    #roc_svm_result = metrics.roc_auc_score(y_test, y_score_svm)
    accuracy_svm_sm.append(acc_svm_sm)
    f1_score_svm_sm.append(f1_score_svm_sm_result)

In [139]:
data_tuples = list(zip(accuracy_svm_sm,f1_score_svm_sm,sum_train,sum_test))
result_svm_sm = pd.DataFrame(data_tuples,columns=['accuracy','f1_score','sum_train','sum_test'])
result_svm_sm['sum_train'] = result_svm_sm['sum_train']/len(y_train_ovs)
result_svm_sm['sum_test'] = result_svm_sm['sum_test']/len(y_test)
result_svm_sm

Unnamed: 0,accuracy,f1_score,sum_train,sum_test
0,0.97035,0.819672,0.044173,0.067385
1,0.973046,0.821429,0.031642,0.061995
2,0.96496,0.763636,0.031328,0.059299
3,0.948787,0.732394,0.030702,0.09434
4,0.948787,0.732394,0.030702,0.09434
5,0.981132,0.758621,0.020363,0.03504
6,0.975741,0.689655,0.015977,0.02965
7,0.986523,0.666667,0.014411,0.021563
8,0.894879,0.133333,0.010025,0.016173
9,0.997305,0.947368,0.009712,0.024259


In [140]:
 result_svm_sm["f1_score"].mean()

0.706517008172844

### ------------------------------------------------------------------------------------------------------------------------------------------------
### Isolating this step to show what Smote does

##### Smote- Oversampling

In [20]:
#Smote
print('Before UnderSampling, the shape of train_X: {}'.format(X_train_arr.shape)) 
print('Before UnderSampling, the shape of train_y: {} \n'.format(y_train_arr.shape)) 

print("Before OverSampling, counts of label '1': {}".format(sum(y_train_arr == 1))) 
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train_arr == 0))) 
  
# import SMOTE module from imblearn library 
# pip install imblearn (if you don't have imblearn in your system) 
from imblearn.over_sampling import SMOTE 
sm = SMOTE(random_state = 2) 
X_train_ovs, y_train_ovs = sm.fit_sample(X_train_arr, y_train_arr.ravel()) #ovs= denotes oversampled data
  
print('After OverSampling, the shape of train_X: {}'.format(X_train_ovs.shape)) 
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_ovs.shape)) 
  
print("After OverSampling, counts of label '1': {}".format(sum(y_train_ovs == 1))) 
print("After OverSampling, counts of label '0': {}".format(sum(y_train_ovs == 0))) 

Before UnderSampling, the shape of train_X: (1627, 358)
Before UnderSampling, the shape of train_y: (1627,) 

Before OverSampling, counts of label '1': 31
Before OverSampling, counts of label '0': 1596 

After OverSampling, the shape of train_X: (3192, 358)
After OverSampling, the shape of train_y: (3192,) 

After OverSampling, counts of label '1': 1596
After OverSampling, counts of label '0': 1596


### ------------------------------------------------------------------------------------------------------------------------------------------------

# SMOTE and AdaBoost

In [141]:
# Recreate X_train and X_test again
X_train = df_train_new.loc[:, ~df_train_new.columns.isin(columns_not_include)]

X_test = df_test_new.loc[:, ~df_test_new.columns.isin(columns_not_include)]


X_train= X_train.iloc[:,2:]
X_test= X_test.iloc[:,2:]

In [142]:
# Run PCA for the total dataset(Train+Test)

X_train = X_train.fillna(0)
X_test = X_test.fillna(0)
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
pca = PCA(0.9)
pca.fit(X_train)
X_train = pd.DataFrame(pca.transform(X_train))
X_test = pd.DataFrame(pca.transform(X_test))

In [143]:
#change from dataframe to numpy array to use in smote
X_train_arr= np.array(X_train)
X_test_arr= np.array(X_test)

In [144]:
def prettywrite(l,filename):
    with open(filename, "a+") as f:
        f.write(prettyprint(l))

### Adaboost with using SVM model as base estimator

In [145]:
accuracy_abc_ovs = []
f1_score_abc_ovs = []
roc_abc_ovs = []

# compute how many 1 in each model
sum_train=[]
sum_test=[]

# Try first 10 models
for num_models in range(0,10): 
    # create train and test
    y_train = []
    y_test = []
    for i  in range(len(df_train_new)):
        y_train.append(int(df_train_new['Vec_Tags'].iloc[i].split(']')[0].split('[')[1].split(',')[num_models].rstrip(' ').lstrip(' ')))
    for j  in range(len(df_test_new)):
        y_test.append(int(df_test_new['Vec_Tags'].iloc[j].split(']')[0].split('[')[1].split(',')[num_models].rstrip(' ').lstrip(' ')))
    
    sum_train.append(sum(y_train))
    sum_test.append(sum(y_test))
    
    y_train_arr= np.array(y_train)
    y_test_arr= np.array(y_test)
    
    # import SMOTE module from imblearn library 
    # pip install imblearn (if you don't have imblearn in your system) 
    from imblearn.over_sampling import SMOTE 
    sm = SMOTE(random_state = 2) 
    X_train_ovs, y_train_ovs = sm.fit_sample(X_train_arr, y_train_arr.ravel()) #ovs= denotes oversampled data
    
    
    #Adaboost
    from sklearn.ensemble import AdaBoostClassifier
    
    #use svm_sm (the previous model) as the base estimator 

    # Create adaboost classifer object
    abc =AdaBoostClassifier(n_estimators=50, base_estimator=svm_sm, algorithm='SAMME', learning_rate=1)

    # Train Adaboost Classifer with oversampled data
    abc_ovs = abc.fit(X_train_ovs, y_train_ovs)
    y_pred_abc_ovs_test = abc_ovs.predict(X_test)
    y_pred_abc_ovs_train = abc_ovs.predict(X_train_ovs)

    #y_score_ab_ovs = svm.predict_proba(X_test)[:,1]
    acc_abc_ovs = accuracy_score(y_test, y_pred_abc_ovs_test)
    f1_score_abc_ovs_result = metrics.f1_score(y_test, y_pred_abc_ovs_test)

    #roc_svm_result = metrics.roc_auc_score(y_test, y_score_svm)
    accuracy_abc_ovs.append(acc_abc_ovs)
    f1_score_abc_ovs.append(f1_score_abc_ovs_result)
    
    l=[acc_abc_ovs,f1_score_abc_ovs_result]
    
    prettywrite(l,"results.txt")

In [146]:
data_tuples = list(zip(accuracy_abc_ovs,f1_score_abc_ovs,sum_train,sum_test))
result_abc_ovs = pd.DataFrame(data_tuples,columns=['accuracy','f1_score','sum_train','sum_test'])
result_abc_ovs['sum_train'] = result_abc_ovs['sum_train']/len(y_train_ovs)
result_abc_ovs['sum_test'] = result_abc_ovs['sum_test']/len(y_test)
result_abc_ovs

Unnamed: 0,accuracy,f1_score,sum_train,sum_test
0,0.962264,0.78125,0.044173,0.067385
1,0.967655,0.785714,0.031642,0.061995
2,0.967655,0.777778,0.031328,0.059299
3,0.946092,0.72973,0.030702,0.09434
4,0.946092,0.72973,0.030702,0.09434
5,0.986523,0.814815,0.020363,0.03504
6,0.986523,0.8,0.015977,0.02965
7,0.991914,0.823529,0.014411,0.021563
8,0.916442,0.205128,0.010025,0.016173
9,0.997305,0.947368,0.009712,0.024259


In [147]:
 result_abc_ovs["f1_score"].mean()

0.739504237571188

### Adaboost using default base estimator--- DecisionTreeClassifier(max_depth=1)

In [129]:
# Recreate X_train and X_test again
X_train = df_train_new.loc[:, ~df_train_new.columns.isin(columns_not_include)]

X_test = df_test_new.loc[:, ~df_test_new.columns.isin(columns_not_include)]


X_train= X_train.iloc[:,2:]
X_test= X_test.iloc[:,2:]

In [130]:
# Run PCA for the total dataset(Train+Test)

X_train = X_train.fillna(0)
X_test = X_test.fillna(0)
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
pca = PCA(0.9)
pca.fit(X_train)
X_train = pd.DataFrame(pca.transform(X_train))
X_test = pd.DataFrame(pca.transform(X_test))

In [131]:
#change from dataframe to numpy array to use in smote
X_train_arr= np.array(X_train)
X_test_arr= np.array(X_test)

In [132]:
#d for default
accuracy_abc_ovs_d = []
f1_score_abc_ovs_d = []
roc_abc_ovs_d = []

# compute how many 1 in each model
sum_train=[]
sum_test=[]

# Try first 10 models
for num_models in range(0,10): 
    # create train and test
    y_train = []
    y_test = []
    for i  in range(len(df_train_new)):
        y_train.append(int(df_train_new['Vec_Tags'].iloc[i].split(']')[0].split('[')[1].split(',')[num_models].rstrip(' ').lstrip(' ')))
    for j  in range(len(df_test_new)):
        y_test.append(int(df_test_new['Vec_Tags'].iloc[j].split(']')[0].split('[')[1].split(',')[num_models].rstrip(' ').lstrip(' ')))
    
    sum_train.append(sum(y_train))
    sum_test.append(sum(y_test))
    
    y_train_arr= np.array(y_train)
    y_test_arr= np.array(y_test)
    
    # import SMOTE module from imblearn library 
    # pip install imblearn (if you don't have imblearn in your system) 
    from imblearn.over_sampling import SMOTE 
    sm = SMOTE(random_state = 2) 
    X_train_ovs, y_train_ovs = sm.fit_sample(X_train_arr, y_train_arr.ravel()) #ovs= denotes oversampled data
    
    
    #Adaboost
    from sklearn.ensemble import AdaBoostClassifier

    # Create adaboost classifer object
    abc_d =AdaBoostClassifier(n_estimators=50, learning_rate=1)

    # Train Adaboost Classifer with oversampled data
    abc_ovs_d = abc_d.fit(X_train_ovs, y_train_ovs)
    y_pred_abc_ovs_d_test = abc_ovs_d.predict(X_test)
    y_pred_abc_ovs_d_train = abc_ovs_d.predict(X_train_ovs)

    #y_score_ab_ovs = svm.predict_proba(X_test)[:,1]
    acc_abc_ovs_d = accuracy_score(y_test, y_pred_abc_ovs_d_test)
    f1_score_abc_ovs_d_result = metrics.f1_score(y_test, y_pred_abc_ovs_d_test)

    #roc_svm_result = metrics.roc_auc_score(y_test, y_score_svm)
    accuracy_abc_ovs_d.append(acc_abc_ovs_d)
    f1_score_abc_ovs_d.append(f1_score_abc_ovs_d_result)
    
    l=[acc_abc_ovs_d,f1_score_abc_ovs_d_result]
    
    prettywrite(l,"results.txt")

In [133]:
data_tuples = list(zip(accuracy_abc_ovs_d,f1_score_abc_ovs_d,sum_train,sum_test))
result_abc_ovs_d = pd.DataFrame(data_tuples,columns=['accuracy','f1_score','sum_train','sum_test'])
result_abc_ovs_d['sum_train'] = result_abc_ovs_d['sum_train']/len(y_train_ovs)
result_abc_ovs_d['sum_test'] = result_abc_ovs_d['sum_test']/len(y_test)
result_abc_ovs_d

Unnamed: 0,accuracy,f1_score,sum_train,sum_test
0,0.986523,0.909091,0.044173,0.067385
1,0.986523,0.883721,0.031642,0.061995
2,0.978437,0.818182,0.031328,0.059299
3,0.973046,0.848485,0.030702,0.09434
4,0.973046,0.848485,0.030702,0.09434
5,0.994609,0.928571,0.020363,0.03504
6,0.989218,0.833333,0.015977,0.02965
7,0.986523,0.666667,0.014411,0.021563
8,0.97035,0.352941,0.010025,0.016173
9,0.991914,0.842105,0.009712,0.024259


In [134]:
 result_abc_ovs_d["f1_score"].mean()

0.7931581222674894

### ----------------------------------------------------------------------------------------------------------------------------------------------------------

### Pretty Printer prep

In [28]:
l= [0.091,0.992,.844]
l[-1]
#str(l)
#brackets are just seen as characters in a string
#pstr= ",".join(map(str,l)) + ";\n" #map string converter across every element of l-- common is the joiner (value delimiter)

0.844

In [2]:
with open("adaboost_results.txt", "a+") as f:
    f.write("test bitch\n")

In [23]:
def prettyprint(l):
    pstr= ",".join(map(str,l)) + ";\n"
    return pstr

def prettywrite(l,filename):
    with open(filename, "a+") as f:
        f.write(prettyprint(l))


In [18]:
prettyprint(l)

'0.091,0.992,0.844;\n'

In [27]:
prettywrite(l,"text.txt")