In [70]:
import re
import spacy
from wordcloud import WordCloud, STOPWORDS 
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix,classification_report
from sklearn.model_selection import train_test_split
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.ensemble import RandomForestClassifier
import datetime
import nltk
from nltk import PorterStemmer
nltk.download("wordnet")
from nltk import WordNetLemmatizer
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Chayan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [71]:
# Environment check: display the version string exposed by the `re` module.
re.__version__

'2.2.1'

In [107]:
def Preprocessing_lemmatization(df,column_name):
    """Normalise a text column with WordNet lemmatization.

    Each value of ``df[column_name]`` is cleaned in this order:
    non-letters become spaces, whitespace runs are collapsed and trimmed,
    the text is lower-cased, tokens of length <= 2 are dropped, wordcloud
    stop words are dropped, and every remaining token is lemmatized with
    ``WordNetLemmatizer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. The column is overwritten in place.
    column_name : str
        Name of the column to clean; its values must be strings.

    Returns
    -------
    pandas.Series
        The cleaned column (``df[column_name]`` after the update).
    """
    non_letter = re.compile('[^A-Za-z]')
    whitespace_run = re.compile(r"\s+")
    stop_set = set(STOPWORDS)
    wordnet = WordNetLemmatizer()

    def _normalise(text):
        # Strip everything that is not an ASCII letter, then tidy the spacing.
        letters_only = non_letter.sub(" ", text)
        tidy = whitespace_run.sub(" ", letters_only).strip().lower()

        lemmas = []
        for word in tidy.split():
            if len(word) <= 2:
                continue  # very short tokens carry little signal
            if word in stop_set:
                continue
            lemmas.append(wordnet.lemmatize(word))
        return " ".join(lemmas)

    df[column_name] = df[column_name].apply(_normalise)
    return df[column_name]

In [88]:
def Preprocessing_lemmatization_spacy(df,column_name):
    """Normalise a text column with spaCy lemmatization.

    Each value of ``df[column_name]`` is cleaned in this order:
    non-letters become spaces, whitespace runs are collapsed and trimmed,
    the text is lower-cased, tokens of length <= 2 are dropped, wordcloud
    stop words are dropped, and the remaining text is passed through the
    ``en_core_web_sm`` pipeline, keeping each token's ``lemma_``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. The column is overwritten in place.
    column_name : str
        Name of the column to clean; its values must be strings.

    Returns
    -------
    pandas.Series
        The cleaned column (``df[column_name]`` after the update).
    """
    non_letter = re.compile('[^A-Za-z]')
    whitespace_run = re.compile(r"\s+")
    stop_set = set(STOPWORDS)
    # The pipeline is loaded on every call of this function.
    nlp = spacy.load("en_core_web_sm")

    def _normalise(text):
        letters_only = non_letter.sub(" ", text)
        tidy = whitespace_run.sub(" ", letters_only).strip().lower()
        kept = [
            word
            for word in tidy.split()
            if len(word) > 2 and word not in stop_set
        ]
        # spaCy re-tokenises the space-joined text before lemmatizing.
        doc = nlp(" ".join(kept))
        return " ".join(token.lemma_ for token in doc)

    df[column_name] = df[column_name].apply(_normalise)
    return df[column_name]

In [None]:
def _print_rf_metrics(header, y_true, y_pred):
    """Print the confusion matrix, scalar scores and classification report for one split."""
    print(header)
    print(confusion_matrix(y_true,y_pred))
    print("accuracy -- " , accuracy_score(y_true,y_pred))
    print("precision_score -- " , precision_score(y_true,y_pred))
    print("recall_score -- " , recall_score(y_true,y_pred))
    print("f1_score -- " , f1_score(y_true,y_pred))
    print("classification report --- \n",classification_report(y_true,y_pred))


def training(path,validation_split_ratio,tf_ngram_range,tf_max_df,tf_min_df,tf_max_features,rf_max_depth,rf_n_estimators,rf_max_features = "auto",rf_min_samples_split = 2,rf_min_samples_leaf = 1):
    """Train a TF-IDF + random-forest email classifier and pickle both artefacts.

    The input table must contain the columns ``Tower Name``,
    ``Application Name``, ``Email Subject`` and ``Flag``. Rows with a missing
    ``Flag`` are dropped. The three text columns are concatenated, cleaned with
    ``Preprocessing_lemmatization`` and vectorised with ``TfidfVectorizer``.
    ``Flag`` is encoded as ``Actionable -> 0`` and ``Non Actionable -> 1``.

    Side effects: writes ``tfidf<HH_MM_SS>.pkl`` and
    ``random_forest_tfidf<HH_MM_SS>.pkl`` to the current working directory
    (both files share the same timestamp suffix) and prints shapes and
    train/test metrics.

    Parameters
    ----------
    path : str
        Location of the training table; read as Excel first, then as CSV.
    validation_split_ratio : float or int
        Forwarded to ``train_test_split`` as ``test_size``.
    tf_ngram_range, tf_max_df, tf_min_df, tf_max_features
        Forwarded to ``TfidfVectorizer`` (``ngram_range``, ``max_df``,
        ``min_df``, ``max_features``).
    rf_max_depth, rf_n_estimators, rf_max_features, rf_min_samples_split, rf_min_samples_leaf
        Forwarded to ``RandomForestClassifier``. The legacy value ``"auto"``
        for ``rf_max_features`` is translated to ``"sqrt"``.

    Raises
    ------
    ValueError
        If ``Flag`` contains a label other than ``Actionable`` or
        ``Non Actionable``.
    """
    # Reading: try Excel first and fall back to CSV. Only ordinary errors are
    # caught, so KeyboardInterrupt / SystemExit are no longer swallowed.
    try:
        data = pd.read_excel(path)
    except Exception:
        data = pd.read_csv(path)

    # get rid of NA values in Flag
    data = data[~data['Flag'].isna()]
    print(data.shape)

    data_copy = data.copy()
    data_copy['combined'] = data_copy.apply(lambda x : f"{x['Tower Name']} {x['Application Name']} {x['Email Subject']}",axis = 1)
    data_copy["combined"] = Preprocessing_lemmatization(data_copy, "combined")
    X = data_copy["combined"]

    label_to_id = {"Actionable" : 0, "Non Actionable" : 1}
    y = data_copy["Flag"].map(label_to_id)
    # .map() turns unknown labels into NaN; fail early with a clear message
    # instead of letting the classifier choke on NaN targets later.
    unmapped = y.isna()
    if unmapped.any():
        unexpected = sorted(set(data_copy.loc[unmapped, "Flag"].astype(str)))
        raise ValueError(
            f"Unexpected values in 'Flag' column: {unexpected}; "
            f"expected one of {list(label_to_id)}"
        )

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= validation_split_ratio, random_state=42)

    vectorizer = TfidfVectorizer(use_idf = True, ngram_range = tf_ngram_range,max_df = tf_max_df,min_df = tf_min_df,max_features = tf_max_features,norm= 'l2')
    train_data = vectorizer.fit_transform(X_train)
    test_data = vectorizer.transform(X_test)
    print(train_data.shape)
    print(test_data.shape)

    # One timestamp for both artefacts so the vectorizer and the model written
    # by the same run always carry the same suffix.
    stamp = datetime.datetime.now().time().strftime('%H_%M_%S')

    # Dump the fitted vectorizer (context manager guarantees flush + close).
    with open(f"tfidf{stamp}.pkl", "wb") as f:
        pickle.dump(vectorizer, f)

    # "auto" is no longer accepted by recent scikit-learn releases; for
    # classifiers it was an alias of "sqrt", so translate it transparently.
    if isinstance(rf_max_features, str) and rf_max_features == "auto":
        rf_max_features = "sqrt"

    rf_model = RandomForestClassifier(max_depth = rf_max_depth,
                                      max_features = rf_max_features,
                                      n_estimators = rf_n_estimators,
                                      min_samples_split = rf_min_samples_split,
                                      min_samples_leaf = rf_min_samples_leaf,
                                      random_state = 10)
    rf_model.fit(train_data,y_train)

    with open(f"random_forest_tfidf{stamp}.pkl", 'wb') as f:
        pickle.dump(rf_model, f)

    ## prediction
    print(train_data.shape)
    prediction_rf_train = rf_model.predict(train_data)
    _print_rf_metrics("---------------Training Result random forest--------------- \n \n",
                      y_train, prediction_rf_train)

    print(test_data.shape)
    prediction_rf_test = rf_model.predict(test_data)
    _print_rf_metrics("---------------Testing Result random forest--------------- \n \n",
                      y_test, prediction_rf_test)

In [106]:
# Run one training pass on Email_Classificaion.xlsx: 40% of the rows are held
# out for validation, TF-IDF uses unigrams and bigrams, and the random forest
# considers all features at each split (rf_max_features = None).
training("Email_Classificaion.xlsx",
         validation_split_ratio = .4,
         tf_ngram_range = (1,2),
         tf_max_df = 20,
         tf_min_df = 3,
         tf_max_features = 100,
         rf_max_depth = 5,
         rf_n_estimators = 150,
         rf_max_features = None,
         rf_min_samples_split = 3,
         rf_min_samples_leaf = 2
        )

(66, 4)
(39, 50)
(27, 50)
(39, 50)
---------------Training Result random forest--------------- 
 

[[25  2]
 [ 1 11]]
accuracy --  0.9230769230769231
precision_score --  0.8461538461538461
recall_score --  0.9166666666666666
f1_score --  0.8799999999999999
classification report --- 
               precision    recall  f1-score   support

           0       0.96      0.93      0.94        27
           1       0.85      0.92      0.88        12

   micro avg       0.92      0.92      0.92        39
   macro avg       0.90      0.92      0.91        39
weighted avg       0.93      0.92      0.92        39

(27, 50)
---------------Testing Result random forest--------------- 
 

[[19  2]
 [ 2  4]]
accuracy --  0.8518518518518519
precision_score --  0.6666666666666666
recall_score --  0.6666666666666666
f1_score --  0.6666666666666666
classification report --- 
               precision    recall  f1-score   support

           0       0.90      0.90      0.90        21
           1       0.

## Testing

In [61]:
def prediction_single_observation(tower_name,application_name,email_subject,tf_extractor_path,rf_model_path):
    """Classify a single email and print ``Actionable`` or ``Non Actionable``.

    The three text inputs are joined in the same order ``training`` uses
    (tower, application, subject) and cleaned with
    ``Preprocessing_lemmatization``, the same preprocessing ``training``
    applies, so the tokens line up with the fitted TF-IDF vocabulary.

    Parameters
    ----------
    tower_name, application_name, email_subject
        Raw field values; each is converted with ``str``.
    tf_extractor_path : str
        Path to the pickled TF-IDF vectorizer.
    rf_model_path : str
        Path to the pickled classifier.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code. Only pass paths to pickle
    files you created yourself or otherwise trust.
    """
    # input
    combined = f"{str(tower_name)} {str(application_name)} {str(email_subject)}"

    # preprocessing: reuse the training-time cleaner instead of a separate
    # spaCy pipeline; different lemmatizers can yield different tokens than
    # the ones the vectorizer was fitted on.
    frame = pd.DataFrame({"combined": [combined]})
    cleaned = Preprocessing_lemmatization(frame, "combined")

    # Feature Extraction
    # load the feature extractor (trusted file only, see Notes)
    with open(tf_extractor_path, 'rb') as f:
        tf_idf = pickle.load(f)
    test_data = tf_idf.transform(cleaned)

    # load the model from disk (trusted file only, see Notes)
    with open(rf_model_path, 'rb') as f:
        loaded_model = pickle.load(f)
    result = loaded_model.predict(test_data)

    # 0 / 1 follow the label encoding used by training().
    if result[0] == 0:
        print("Actionable")
    else:
        print("Non Actionable")

        
        
    

def prediction_data(path,tf_extractor_path,rf_model_path):
    """Classify every row of a table and write the result to ``result_data.csv``.

    The first three columns of the input (by position) are joined into a
    ``combined`` text column, cleaned with ``Preprocessing_lemmatization``
    (the same preprocessing ``training`` applies), vectorised with the pickled
    TF-IDF extractor and classified with the pickled model. Predictions are
    stored in a ``Flag`` column as ``Actionable`` / ``Non Actionable``.

    Parameters
    ----------
    path : str
        Location of the input table; read as Excel first, then as CSV.
    tf_extractor_path : str
        Path to the pickled TF-IDF vectorizer.
    rf_model_path : str
        Path to the pickled classifier.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code. Only pass paths to pickle
    files you created yourself or otherwise trust.
    """
    # Reading: try Excel first and fall back to CSV. Only ordinary errors are
    # caught, so KeyboardInterrupt / SystemExit are no longer swallowed.
    try:
        data = pd.read_excel(path)
    except Exception:
        data = pd.read_csv(path)

    # preprocessing
    # .iloc makes the positional access explicit; plain x[0] on a row with
    # string column labels relies on a deprecated positional fallback.
    data['combined'] = data.apply(lambda x : f"{x.iloc[0]} {x.iloc[1]} {x.iloc[2]}",axis = 1)
    # Same cleaner as training(), so the tokens match the fitted vocabulary.
    data["combined"] = Preprocessing_lemmatization(data, "combined")

    # Feature Extraction
    # load the feature extractor (trusted file only, see Notes)
    with open(tf_extractor_path, 'rb') as f:
        tf_idf = pickle.load(f)
    test_data = tf_idf.transform(data["combined"])

    # load the model (trusted file only, see Notes)
    with open(rf_model_path, 'rb') as f:
        loaded_model = pickle.load(f)
    result = loaded_model.predict(test_data)

    # concat prediction, decoded with the inverse of the training label map
    data["Flag"] = result
    data["Flag"] = data["Flag"].map({0 : "Actionable" , 1 : "Non Actionable"})

    data.to_csv("result_data.csv")
    print("check data in your current directory")

## Checking

In [62]:
# Classify one hand-written example with the vectorizer / model pickles named below.
prediction_single_observation('Core Pharmacy' ,'PharmRDS-Accredo RealTime Messages','error alert please solve','tfidf22_43_58.pkl','random_forest_tfidf22_43_58.pkl')

Actionable


In [63]:
# Batch-classify Book1.xlsx with the same pickles; the function writes result_data.csv.
prediction_data("Book1.xlsx",'tfidf22_43_58.pkl','random_forest_tfidf22_43_58.pkl')

check data in your current directory
