In [61]:
import pandas as pd
import numpy as np
import re
import nltk
from wordcloud import WordCloud, STOPWORDS 
import matplotlib.pyplot as plt 
from nltk import PorterStemmer
# nltk.download("wordnet")
from nltk import WordNetLemmatizer
import spacy

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Chayan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
data = pd.read_excel("Email_Classificaion.xlsx")
data.head()

Unnamed: 0,Tower Name,Application Name,Email Subject,Flag
0,Core Pharmacy,PharmRDS-Accredo RealTime Messages,Production - Accredo Real Time Order Process S...,Actionable
1,Core Pharmacy,IVR - Web Service,Splunk Alert: SO IVR - Pharmacy Migration Coor...,Actionable
2,Core Pharmacy,ISAM- Web Service,Splunk Alert: ISAM - Slowness,Actionable
3,Core Pharmacy,Migration Co-Ordinaor,Splunk Alert: Migration_Coordinator_500 Clone,Actionable
4,Core Pharmacy,Claim Realm - Web Service,Splunk Alert: Claim Realm Service - Slowness A...,Actionable


In [3]:
data.shape

(73, 4)

In [4]:
data['Flag'].value_counts()

Actionable        48
Non Actionable    18
Name: Flag, dtype: int64

In [5]:
data.isnull().sum()

Tower Name          0
Application Name    0
Email Subject       0
Flag                7
dtype: int64

In [6]:
# ger rid of na values in Flag
data = data[~data['Flag'].isna()]
print(data.shape)
data.head()

(66, 4)


Unnamed: 0,Tower Name,Application Name,Email Subject,Flag
0,Core Pharmacy,PharmRDS-Accredo RealTime Messages,Production - Accredo Real Time Order Process S...,Actionable
1,Core Pharmacy,IVR - Web Service,Splunk Alert: SO IVR - Pharmacy Migration Coor...,Actionable
2,Core Pharmacy,ISAM- Web Service,Splunk Alert: ISAM - Slowness,Actionable
3,Core Pharmacy,Migration Co-Ordinaor,Splunk Alert: Migration_Coordinator_500 Clone,Actionable
4,Core Pharmacy,Claim Realm - Web Service,Splunk Alert: Claim Realm Service - Slowness A...,Actionable


In [7]:
data['Flag'].value_counts()

Actionable        48
Non Actionable    18
Name: Flag, dtype: int64

In [10]:
data_copy = data.copy()

In [80]:
data_copy['combined'] = data_copy.apply(lambda x : f"{x[0]} {x[1]} {x[2]}",axis = 1)

In [81]:
data_copy.head()

Unnamed: 0,Tower Name,Application Name,Email Subject,Flag,combined
0,Core Pharmacy,PharmRDS-Accredo RealTime Messages,Production - Accredo Real Time Order Process S...,Actionable,Core Pharmacy PharmRDS-Accredo RealTime Messag...
1,Core Pharmacy,IVR - Web Service,Splunk Alert: SO IVR - Pharmacy Migration Coor...,Actionable,Core Pharmacy IVR - Web Service Splunk Alert: ...
2,Core Pharmacy,ISAM- Web Service,Splunk Alert: ISAM - Slowness,Actionable,Core Pharmacy ISAM- Web Service Splunk Alert: ...
3,Core Pharmacy,Migration Co-Ordinaor,Splunk Alert: Migration_Coordinator_500 Clone,Actionable,Core Pharmacy Migration Co-Ordinaor Splunk Ale...
4,Core Pharmacy,Claim Realm - Web Service,Splunk Alert: Claim Realm Service - Slowness A...,Actionable,Core Pharmacy Claim Realm - Web Service Splunk...


## Functions

### Preprocessing

In [77]:
def Preprocessing_stemming(df,column_name):
    # Email Subject column
    df[column_name] = df[column_name].apply(lambda x: re.sub('[^A-Za-z]'," ",x))
    df[column_name] = df[column_name].replace([" ","  ","   "]," ",regex = True)
    stopwords = set(STOPWORDS) 
    stemmer = PorterStemmer()
    df[column_name] = df[column_name].apply(lambda x : x.lower())
    df[column_name] = df[column_name].apply(lambda x : " ".join(word for word in x if len(word) > 2))
    df[column_name] = df[column_name].apply(lambda x : " ".join([word for word in x.split() if word not in stopwords]))
    df[column_name] = df[column_name].apply(lambda x : stemmer.stem(x)) 
    return df[column_name]


def Preprocessing_lemmatization(df,column_name):
    # Email Subject column
    df[column_name] = df[column_name].apply(lambda x: re.sub('[^A-Za-z]'," ",x))
    df[column_name] = df[column_name].replace([" ","  ","   "]," ",regex = True)
    stopwords = set(STOPWORDS) 
    lemmatizer = WordNetLemmatizer()
    df[column_name] = df[column_name].apply(lambda x : x.lower())
    df[column_name] = df[column_name].apply(lambda x : " ".join(word for word in x if len(word) > 2))
    df[column_name] = df[column_name].apply(lambda x : " ".join([word for word in x.split() if word not in stopwords]))
    df[column_name] = df[column_name].apply(lambda x : lemmatizer.lemmatize(x)) 
    return df[column_name]


def Preprocessing_lemmatization_spacy(df,column_name):
    # Email Subject column
    df[column_name] = df[column_name].apply(lambda x: re.sub('[^A-Za-z]'," ",x))
    df[column_name] = df[column_name].replace([" ","  ","   "]," ",regex = True)
    stopwords = set(STOPWORDS) 
    nlp = spacy.load("en_core_web_sm")
    df[column_name] = df[column_name].apply(lambda x : x.lower())
    df[column_name] = df[column_name].apply(lambda x : " ".join(word for word in x.split() if len(word) > 2))
    df[column_name] = df[column_name].apply(lambda x : " ".join([word for word in x.split() if word not in stopwords]))
    df[column_name] = df[column_name].apply(lambda x : " ".join(token.lemma_ for token in nlp(x))) 
    return df[column_name]


### Feature Extraction

In [106]:
from sklearn.feature_extraction.text import TfidfVectorizer 
import gensim

def feature_extraction_1(df_train,df_test = None):
    vectorizer = TfidfVectorizer(use_idf = True, ngram_range = (1,2),max_df = 5,min_df = 3,max_features = 100)
    train_data = vectorizer.fit_transform(df_train)
    if df_test != None:
        test_data = vectorizer.transform(df_test)
    else:
        test_data = None
        
    print("Features : \n", vectorizer.vocabulary_)
    return train_data,test_data



def word_embedding_feature(tokenized_data):
    wordvec_arrays = np.zeros((len(tokenized_data), 50)) 
    
    for i in range(len((tokenized_data))):
        size = len(tokenized_data[i])
        aggregated_wv = np.zeros(50)

        for word in tokenized_data[i]:
            try:
                aggregated_wv += model_w2v[word]
            except:
                aggregated_wv += np.zeros(50)

        aggregated_wv = aggregated_wv / size
        wordvec_arrays[i] = aggregated_wv
    return pd.DataFrame(wordvec_arrays) 


def feature_extraction_2(df_train,df_test = None):
    train_tokenized_mails = df_train.apply(lambda x : x.split()).reset_index(drop= 'first')
    model_w2v = gensim.models.Word2Vec(train_tokenized_mails,
                          size = 50,
                          window = 3,
                          min_count = 1,
                          sg = 1,
                          negative = 5,
                          workers = 4,
                          seed = 34)

    model_w2v.train(train_tokenized_mails, total_examples= len(df_train), epochs=20)
    
    print("Features -- \n", model_w2v.wv.vocab.keys())
    train_wordvec_df = word_embedding_feature(train_tokenized_mails)

    if df_test != None:
        test_tokenized_mails = df_test.apply(lambda x : x.split()).reset_index(drop= 'first')
        test_wordvec_df = word_embedding_feature(test_tokenized_mails)
    else:
        test_wordvec_df = None
    return train_wordvec_df,test_wordvec_df



def new_word_embedding_feature_with_tfidf(tokenized_data,tfidf_data):
    wordvec_arrays = np.zeros((len(tokenized_data), 50)) 
    
    for i in range(len((tokenized_data))):
        
        size = len(tfidf_data.loc[i,tfidf_data.iloc[i,:] != 0])
        if size == 0:
            size = 1
            
        aggregated_wv = np.zeros(50)

        for word in tokenized_data[i]:
            try:
                aggregated_wv += model_w2v[word] * tfidf_data[word][i]
            except:
                aggregated_wv += np.zeros(50)

        aggregated_wv = aggregated_wv / size
        wordvec_arrays[i] = aggregated_wv
    return pd.DataFrame(wordvec_arrays) 

def is_phrase_in(phrase, text):
        return re.search(r"\b{}\b".format(phrase), text, re.IGNORECASE) is not None


def replace_bigram(sentence,dict_):
    for bigram in dict_.keys():
        if is_phrase_in(bigram,sentence):
            sentence = sentence.replace(bigram,dict_[bigram])
    return sentence



def feature_extraction_3(df_train,df_test = None):
    doc = " ".join(sent for sent in df_train)
    wordcloud = WordCloud(width = 800, height = 800, 
                background_color ='white', 
                stopwords = stopwords, 
                min_font_size = 20,
                collocation_threshold = 20).generate(doc) 
    top_50_words = list(wordcloud.words_.keys())[:50]
    # changing bigrams to unigrams in X_train for word_embedding
    
    change_dict = {}
    for word in top_50_words:
        if len(word.split()) > 1:
            change_dict[word] = word.replace(" ","_")

    top_50_words_changed = [change_dict[word] if word in list(change_dict.keys()) else word for word in top_50_words] 
    

    X_train_copy = df_train.copy()
    X_train_copy = X_train_copy.apply(lambda x: replace_bigram(x,change_dict))
    
    # create tfidf based on these top_50_words
    tfidf_1 = TfidfVectorizer(vocabulary = top_50_words_changed)
    new_train_data = tfidf_1.fit_transform(X_train_copy)
    new_train_data = pd.DataFrame( new_train_data.toarray() , columns = top_50_words_changed)
    
    # create word_embedding 
    train_tokenized_mails = X_train_copy.apply(lambda x : x.split()).reset_index(drop= 'first')
    model_w2v = gensim.models.Word2Vec(train_tokenized_mails,
                          size = 50,
                          window = 3,
                          min_count = 1,
                          sg = 1,
                          negative = 5,
                          workers = 4,
                          seed = 34)
    model_w2v.train(train_tokenized_mails, total_examples= len(X_train_copy), epochs=20)
    print("Features -- \n", model_w2v.wv.vocab.keys())
    train_wordvec_df = new_word_embedding_feature_with_tfidf(train_tokenized_mails,new_train_data)
    
    
    if df_test!= None:
        X_test_copy = df_test.copy()
        X_test_copy = X_test_copy.apply(lambda x: replace_bigram(x,change_dict))
        new_test_data = tfidf_1.transform(X_test_copy)
        new_test_data = pd.DataFrame(new_test_data.toarray() , columns = top_50_words_changed)        
        test_tokenized_mails = X_test_copy.apply(lambda x : x.split()).reset_index(drop= 'first')
        test_wordvec_df = new_word_embedding_feature_with_tfidf(test_tokenized_mails,new_test_data)  
        
    else:
        test_wordvec_df = None
        
    return train_wordvec_df, test_wordvec_df

### Training

In [51]:
# MODEL TRAINING FUNCTION
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix,classification_report
def train(X_tr,X_te,y_tr,y_te):
    
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import LinearSVC
    from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix
    from sklearn.model_selection import GridSearchCV
    ##### random forest classifier
    
   
    rf_clf = RandomForestClassifier()


    params = {"n_estimators" : [100,200,400],
             "max_depth": [2,3,4],
             "min_samples_split" : [2,3,4,5],
             "class_weight" : ["balanced", "balanced_subsample", None]}

    from sklearn.model_selection import GridSearchCV
    grid_search = GridSearchCV(rf_clf,param_grid= params, n_jobs=-1, verbose=1)

    grid_search.fit(X_tr,y_tr)
    
    best_rf_clf = grid_search.best_estimator_
    print("best rf features -- \n",best_rf_clf)
    print("\n")
    best_rf_clf.fit(X_tr,y_tr)

    ## prediction
    prediction_best_rf_train = best_rf_clf.predict(X_tr)

    print("---------------Training Result Random Forest--------------- \n \n")
    print(confusion_matrix(y_tr,prediction_best_rf_train))
    print("accuracy -- " , accuracy_score(y_tr,prediction_best_rf_train))
    print("precision_score -- " , precision_score(y_tr,prediction_best_rf_train))
    print("recall_score -- " , recall_score(y_tr,prediction_best_rf_train))
    print("f1_score -- " , f1_score(y_tr,prediction_best_rf_train))
    print("classification report --- \n",classification_report(y_tr,prediction_best_rf_train))
    
    prediction_best_rf_test = best_rf_clf.predict(X_te)

    print("---------------Testing Result Random Forest--------------- \n \n")
    print(confusion_matrix(y_te,prediction_best_rf_test))
    print("accuracy -- " , accuracy_score(y_te,prediction_best_rf_test))
    print("precision_score -- " , precision_score(y_te,prediction_best_rf_test))
    print("recall_score -- " , recall_score(y_te,prediction_best_rf_test))
    print("f1_score -- " , f1_score(y_te,prediction_best_rf_test))
    print("classification report --- \n",classification_report(y_te,prediction_best_rf_test))

    print("\n\n")
    ####### SVM classification

    svm_clf = LinearSVC()


    params = {'max_iter': (200,400,500,1000),
             "C" : (.001,.002,.005,.009,.01,.02,.05,1),
             "class_weight" : [None,'balanced']}

    from sklearn.model_selection import GridSearchCV
    grid_search = GridSearchCV(svm_clf,param_grid= params, n_jobs=-1, verbose=1)

   
    grid_search.fit(X_tr,y_tr)

    best_svm_clf = grid_search.best_estimator_
    print("best svm features -- \n",best_svm_clf)
    print("\n")
    
    best_svm_clf.fit(X_tr,y_tr)

    ## prediction

    prediction_best_svm_train = best_svm_clf.predict(X_tr)

    print("---------------Training Result SVM--------------- \n \n")
    print(confusion_matrix(y_tr,prediction_best_svm_train))
    print("accuracy -- " , accuracy_score(y_tr,prediction_best_svm_train))
    print("precision_score -- " , precision_score(y_tr,prediction_best_svm_train))
    print("recall_score -- " , recall_score(y_tr,prediction_best_svm_train))
    print("f1_score -- " , f1_score(y_tr,prediction_best_svm_train))
    print("classification report --- \n",classification_report(y_tr,prediction_best_svm_train))

    prediction_best_svm_test = best_svm_clf.predict(X_te)
    
    print("classification report --- \n",classification_report(y_te,prediction_best_rf_test))
    print("---------------Testing Result SVM--------------- \n \n")
    print(confusion_matrix(y_te,prediction_best_svm_test))
    print("accuracy -- " , accuracy_score(y_te,prediction_best_svm_test))
    print("precision_score -- " , precision_score(y_te,prediction_best_svm_test))
    print("recall_score -- " , recall_score(y_te,prediction_best_svm_test))
    print("f1_score -- " , f1_score(y_te,prediction_best_svm_test))

# Feature extraction after split

## Work Done -- 1

### Preprocessing

In [24]:
data_copy["combined"] = Preprocessing_lemmatization(data_copy, "combined")

### Train Test Split

In [26]:
from sklearn.model_selection import train_test_split
X = data_copy["combined"]
y = data_copy["Flag"].map({"Actionable" : 0, "Non Actionable" : 1})
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=42)

### Feature Extraction

In [58]:
train_features, test_features = feature_extraction_2(X_train, X_test)

### Training

In [59]:
train(train_features, test_features,  y_train, y_test)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   21.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:  1.7min finished


best rf features -- 
 RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=2, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=200, n_jobs=None, oob_score=False,
            random_state=None, verbose=0, warm_start=False)


---------------Training Result Random Forest--------------- 
 

[[27  0]
 [12  0]]
accuracy --  0.6923076923076923
precision_score --  0.0
recall_score --  0.0
f1_score --  0.0
classification report --- 
               precision    recall  f1-score   support

           0       0.69      1.00      0.82        27
           1       0.00      0.00      0.00        12

   micro avg       0.69      0.69      0.69        39
   macro avg       0.35      0.50      0.41        39
weighted avg       0.48      0.69      0.57        39

---------------T

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


best svm features -- 
 LinearSVC(C=0.001, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=200,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)


---------------Training Result SVM--------------- 
 

[[27  0]
 [12  0]]
accuracy --  0.6923076923076923
precision_score --  0.0
recall_score --  0.0
f1_score --  0.0
classification report --- 
               precision    recall  f1-score   support

           0       0.69      1.00      0.82        27
           1       0.00      0.00      0.00        12

   micro avg       0.69      0.69      0.69        39
   macro avg       0.35      0.50      0.41        39
weighted avg       0.48      0.69      0.57        39

classification report --- 
               precision    recall  f1-score   support

           0       0.78      1.00      0.88        21
           1       0.00      0.00      0.00         6

   micro avg       0.78      0.78      0.78  

[Parallel(n_jobs=-1)]: Done 192 out of 192 | elapsed:    1.2s finished
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## Work Done -- 2

### Feature Extraction

In [55]:
train_features, test_features = feature_extraction_3(X_train, X_test)

### Training

In [38]:
train(train_features, test_features,  y_train, y_test)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   29.6s
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:   47.3s finished


best rf features -- 
 RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=2, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=None, oob_score=False,
            random_state=None, verbose=0, warm_start=False)


---------------Training Result Random Forest--------------- 
 

[[27  0]
 [12  0]]
accuracy --  0.6923076923076923
precision_score --  0.0
recall_score --  0.0
f1_score --  0.0
---------------Testing Result Random Forest--------------- 
 

[[21  0]
 [ 6  0]]
accuracy --  0.7777777777777778
precision_score --  0.0
recall_score --  0.0
f1_score --  0.0



Fitting 3 folds for each of 64 candidates, totalling 192 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


best svm features -- 
 LinearSVC(C=0.001, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=200,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)


---------------Training Result SVM--------------- 
 

[[27  0]
 [12  0]]
accuracy --  0.6923076923076923
precision_score --  0.0
recall_score --  0.0
f1_score --  0.0
---------------Testing Result SVM--------------- 
 

[[21  0]
 [ 6  0]]
accuracy --  0.7777777777777778
precision_score --  0.0
recall_score --  0.0
f1_score --  0.0


[Parallel(n_jobs=-1)]: Done 192 out of 192 | elapsed:    0.5s finished
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## Work Done -- 3

### Feature Extraction

In [39]:
train_features, test_features = feature_extraction_1(X_train, X_test)

### Training

In [52]:
train(train_features, test_features,  y_train, y_test)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:  1.6min finished


best rf features -- 
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


---------------Training Result Random Forest--------------- 
 

[[27  0]
 [ 9  3]]
accuracy --  0.7692307692307693
precision_score --  1.0
recall_score --  0.25
f1_score --  0.4
classification report --- 
               precision    recall  f1-score   support

           0       0.75      1.00      0.86        27
           1       1.00      0.25      0.40        12

   micro avg       0.77      0.77      0.77        39
   macro avg       0.88      0.62      0.63        39
weighted avg       0.83      0.77      0.72        39

---------------Testin

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


best svm features -- 
 LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=200,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)


---------------Training Result SVM--------------- 
 

[[27  0]
 [ 6  6]]
accuracy --  0.8461538461538461
precision_score --  1.0
recall_score --  0.5
f1_score --  0.6666666666666666
classification report --- 
               precision    recall  f1-score   support

           0       0.82      1.00      0.90        27
           1       1.00      0.50      0.67        12

   micro avg       0.85      0.85      0.85        39
   macro avg       0.91      0.75      0.78        39
weighted avg       0.87      0.85      0.83        39

classification report --- 
               precision    recall  f1-score   support

           0       0.76      0.90      0.83        21
           1       0.00      0.00      0.00         6

   micro avg       0.70      0.70 

[Parallel(n_jobs=-1)]: Done 192 out of 192 | elapsed:    0.8s finished


#### Xgboost

In [42]:
from xgboost import XGBClassifier
xg_clf = XGBClassifier()
xg_clf.fit(train_features,y_train)

  data = yaml.load(f.read()) or {}
  import pandas.util.testing as tm
  defaults = yaml.load(f)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints='',
       learning_rate=0.300000012, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=nan, monotone_constraints='()',
       n_estimators=100, n_jobs=0, num_parallel_tree=1,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
       validate_parameters=1, verbosity=None)

In [50]:
## prediction

prediction_xg_train = xg_clf.predict(train_features)

print("---------------Training Result Xgboost--------------- \n \n")
print(confusion_matrix(y_train,prediction_xg_train))
print("accuracy -- " , accuracy_score(y_train,prediction_xg_train))
print("precision_score -- " , precision_score(y_train,prediction_xg_train))
print("recall_score -- " , recall_score(y_train,prediction_xg_train))
print("f1_score -- " , f1_score(y_train,prediction_xg_train))
print("classification report --- \n",classification_report(y_train,prediction_xg_train))

prediction_xg_test = xg_clf.predict(test_features)

print("---------------Testing Result Xgboost--------------- \n \n")
print(confusion_matrix(y_test,prediction_xg_test))
print("accuracy -- " , accuracy_score(y_test,prediction_xg_test))
print("precision_score -- " , precision_score(y_test,prediction_xg_test))
print("recall_score -- " , recall_score(y_test,prediction_xg_test))
print("f1_score -- " , f1_score(y_test,prediction_xg_test))
print("classification report --- \n",classification_report(y_test,prediction_xg_test))

---------------Training Result Xgboost--------------- 
 

[[27  0]
 [12  0]]
accuracy --  0.6923076923076923
precision_score --  0.0
recall_score --  0.0
f1_score --  0.0
classification report --- 
               precision    recall  f1-score   support

           0       0.69      1.00      0.82        27
           1       0.00      0.00      0.00        12

   micro avg       0.69      0.69      0.69        39
   macro avg       0.35      0.50      0.41        39
weighted avg       0.48      0.69      0.57        39

---------------Testing Result Xgboost--------------- 
 

[[21  0]
 [ 6  0]]
accuracy --  0.7777777777777778
precision_score --  0.0
recall_score --  0.0
f1_score --  0.0
classification report --- 
               precision    recall  f1-score   support

           0       0.78      1.00      0.88        21
           1       0.00      0.00      0.00         6

   micro avg       0.78      0.78      0.78        27
   macro avg       0.39      0.50      0.44        27
weig

## Work Done -- 4

### Preprocessing

In [83]:
data_copy["combined"] = Preprocessing_lemmatization_spacy(data_copy, "combined")

In [84]:
data_copy.head()

Unnamed: 0,Tower Name,Application Name,Email Subject,Flag,combined
0,Core Pharmacy,PharmRDS-Accredo RealTime Messages,Production - Accredo Real Time Order Process S...,Actionable,core pharmacy pharmrd accredo realtime message...
1,Core Pharmacy,IVR - Web Service,Splunk Alert: SO IVR - Pharmacy Migration Coor...,Actionable,core pharmacy ivr web service splunk alert ivr...
2,Core Pharmacy,ISAM- Web Service,Splunk Alert: ISAM - Slowness,Actionable,core pharmacy isam web service splunk alert is...
3,Core Pharmacy,Migration Co-Ordinaor,Splunk Alert: Migration_Coordinator_500 Clone,Actionable,core pharmacy migration ordinaor splunk alert ...
4,Core Pharmacy,Claim Realm - Web Service,Splunk Alert: Claim Realm Service - Slowness A...,Actionable,core pharmacy claim realm web service splunk a...


### Train Test Split

In [85]:
from sklearn.model_selection import train_test_split
X = data_copy["combined"]
y = data_copy["Flag"].map({"Actionable" : 0, "Non Actionable" : 1})
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=42)

### Feature Extraction

In [89]:
train_features, test_features = feature_extraction_1(X_train, X_test)

In [91]:
train_features.columns

AttributeError: columns not found

### Training

In [90]:
train(train_features, test_features,  y_train, y_test)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   55.6s
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:  1.6min finished


best rf features -- 
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


---------------Training Result Random Forest--------------- 
 

[[26  1]
 [ 9  3]]
accuracy --  0.7435897435897436
precision_score --  0.75
recall_score --  0.25
f1_score --  0.375
classification report --- 
               precision    recall  f1-score   support

           0       0.74      0.96      0.84        27
           1       0.75      0.25      0.38        12

   micro avg       0.74      0.74      0.74        39
   macro avg       0.75      0.61      0.61        39
weighted avg       0.75      0.74      0.70        39

---------------Tes

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


best svm features -- 
 LinearSVC(C=0.001, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=200,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)


---------------Training Result SVM--------------- 
 

[[27  0]
 [12  0]]
accuracy --  0.6923076923076923
precision_score --  0.0
recall_score --  0.0
f1_score --  0.0
classification report --- 
               precision    recall  f1-score   support

           0       0.69      1.00      0.82        27
           1       0.00      0.00      0.00        12

   micro avg       0.69      0.69      0.69        39
   macro avg       0.35      0.50      0.41        39
weighted avg       0.48      0.69      0.57        39

classification report --- 
               precision    recall  f1-score   support

           0       0.75      0.86      0.80        21
           1       0.00      0.00      0.00         6

   micro avg       0.67      0.67      0.67  

[Parallel(n_jobs=-1)]: Done 192 out of 192 | elapsed:    0.5s finished
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


# Feature extraction before split

## Work Done - 1

### Preprocessing

In [92]:
data_copy["combined"] = Preprocessing_lemmatization_spacy(data_copy, "combined")
data_copy.head()

Unnamed: 0,Tower Name,Application Name,Email Subject,Flag,combined
0,Core Pharmacy,PharmRDS-Accredo RealTime Messages,Production - Accredo Real Time Order Process S...,Actionable,core pharmacy pharmrd accredo realtime message...
1,Core Pharmacy,IVR - Web Service,Splunk Alert: SO IVR - Pharmacy Migration Coor...,Actionable,core pharmacy ivr web service splunk alert ivr...
2,Core Pharmacy,ISAM- Web Service,Splunk Alert: ISAM - Slowness,Actionable,core pharmacy isam web service splunk alert is...
3,Core Pharmacy,Migration Co-Ordinaor,Splunk Alert: Migration_Coordinator_500 Clone,Actionable,core pharmacy migration ordinaor splunk alert ...
4,Core Pharmacy,Claim Realm - Web Service,Splunk Alert: Claim Realm Service - Slowness A...,Actionable,core pharmacy claim realm web service splunk a...


In [93]:
X = data_copy["combined"]
y = data_copy["Flag"].map({"Actionable" : 0, "Non Actionable" : 1})

### Feature Extraction

In [110]:
train_features,_ = feature_extraction_3(X)

Features -- 


### Train Test Split

In [111]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_features, y, test_size=0.40, random_state=42)

### Training

In [112]:
train(X_train, X_test,  y_train, y_test)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   16.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:  1.6min finished


best rf features -- 
 RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=2, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=200, n_jobs=None, oob_score=False,
            random_state=None, verbose=0, warm_start=False)


---------------Training Result Random Forest--------------- 
 

[[27  0]
 [12  0]]
accuracy --  0.6923076923076923
precision_score --  0.0
recall_score --  0.0
f1_score --  0.0
classification report --- 
               precision    recall  f1-score   support

           0       0.69      1.00      0.82        27
           1       0.00      0.00      0.00        12

   micro avg       0.69      0.69      0.69        39
   macro avg       0.35      0.50      0.41        39
weighted avg       0.48      0.69      0.57        39

---------------T

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


best svm features -- 
 LinearSVC(C=0.001, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=200,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)


---------------Training Result SVM--------------- 
 

[[27  0]
 [12  0]]
accuracy --  0.6923076923076923
precision_score --  0.0
recall_score --  0.0
f1_score --  0.0
classification report --- 
               precision    recall  f1-score   support

           0       0.69      1.00      0.82        27
           1       0.00      0.00      0.00        12

   micro avg       0.69      0.69      0.69        39
   macro avg       0.35      0.50      0.41        39
weighted avg       0.48      0.69      0.57        39

classification report --- 
               precision    recall  f1-score   support

           0       0.78      1.00      0.88        21
           1       0.00      0.00      0.00         6

   micro avg       0.78      0.78      0.78  

[Parallel(n_jobs=-1)]: Done 192 out of 192 | elapsed:    1.0s finished
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
