In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
import os
os.chdir('/content/drive/My Drive/ML Project/project')
!ls

dataset			   LR_training.ipynb   MNB.ipynb
Dataset_Exploration.ipynb  LSTM_train.ipynb    models
dataset_preprocess.ipynb   ML_Pipelines.ipynb  outputs


In [0]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
import nltk

In [0]:
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


Loading Machine Learning Model

In [0]:
import pickle
model_path = "models/LR_model_final.sav"
model = pickle.load(open(model_path, 'rb'))

In [0]:
import pickle
tfidf_vocab = pickle.load(open('models/vocabulary_final.pickle', 'rb'))

In [0]:
tfidf_vocab

{'like': 4939,
 'tribe': 8967,
 'fan': 3224,
 'idiot': 4212,
 'woman': 9785,
 'war': 9463,
 'see': 7709,
 'coming': 1688,
 'xa': 9939,
 'need': 5664,
 'protection': 6898,
 'ignorant': 4218,
 'rhetoric': 7467,
 'representing': 7332,
 'best': 1010,
 'interest': 4475,
 'wow': 9897,
 'many': 5242,
 'win': 9747,
 'year': 9962,
 'bat': 946,
 'nice': 5756,
 'stupid': 8347,
 'haha': 3901,
 'green': 3842,
 'red': 7176,
 'loser': 5112,
 'winning': 9753,
 'moron': 5535,
 'god': 3758,
 'hate': 3955,
 'faggot': 3180,
 'difference': 2515,
 'fag': 3175,
 'fart': 3239,
 'put': 6966,
 'meat': 5353,
 'oh': 5978,
 'go': 3741,
 'kiss': 4727,
 'as': 709,
 'goat': 3757,
 'insult': 4458,
 'veteran': 9353,
 'even': 3006,
 'coward': 2065,
 'incompetent': 4350,
 'chief': 1500,
 'everyone': 3029,
 'enemy': 2911,
 'fall': 3211,
 'hard': 3940,
 'boy': 1193,
 'chance': 1435,
 'kid': 4705,
 'wrong': 9926,
 'real': 7116,
 'shit': 7866,
 'fuck': 3551,
 'live': 5047,
 'ok': 5990,
 'hell': 3998,
 'released': 7256,
 'cop

In [0]:
tf1_new = TfidfVectorizer(analyzer='word', ngram_range=(1,3), max_features=10000, stop_words = "english", lowercase = True
                          , vocabulary = tfidf_vocab)

In [0]:
# Loading Tfidf Vector from saved models
# https://www.kaggle.com/mattwills8/fit-transform-and-save-tfidfvectorizer
# https://stackoverflow.com/questions/29788047/keep-tfidf-result-for-predicting-new-content-using-scikit-for-python


from sklearn.feature_extraction.text import TfidfVectorizer

import pickle
tfidf_path = "models/tfidf_final.pickle"
tfidf_vec = pickle.load(open(tfidf_path, 'rb'))

# tf1_new = TfidfVectorizer(analyzer='word', ngram_range=(1,3), stop_words = "english", lowercase = True
                        #   , vocabulary = tfidf_vec.vocabulary_)

In [0]:
from sklearn.base import TransformerMixin, BaseEstimator
import re
import nltk 
from nltk import WordNetLemmatizer
import multiprocessing as mp


class TextPreprocessor(BaseEstimator, TransformerMixin):

  def __init__(self, variety = 'BrE', user_abbrevs={}, n_jobs=1):
    self.variety = variety
    self.user_abbrevs = user_abbrevs
    self.n_jobs = n_jobs
  
  def fit(self, X, y = None):
    return self
  
  def transform(self, X, *_):
    # incomplete will be completed later
    # X_copy = X.copy()
    X_copy = X
    partitions = 1
    cores = mp.cpu_count()

    if self.n_jobs <= -1:
      partition = cores
    elif self.n_jobs <= 0:
      return X_copy.apply(slef._preprocess_text)
    else:
      partitions = min(self.n_jobs, cores)
    
    data_split = np.array_split(X_copy,partitions)
    pool = mp.Pool(cores)
    data = pd.concat(pool.map(self._preprocess_part, data_split))
    pool.close()
    pool.join()

    return data
  
  def _preprocess_part(self, part):
    return part.apply(self._preprocess_text)
  
  def _preprocess_text(self, text):
    hashtags = re.findall('#\w*',text)
    users = re.findall('@\w*',text)
    links = re.findall('(https|http)?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', text, flags=re.MULTILINE)
    links = list(set(links))
    # Removing Links
    text =  re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', ' ', text, flags=re.MULTILINE)
    # Removing mentions and hashtags
    text = re.sub(r"#(\w+)", ' ', text, flags=re.MULTILINE)
    text = re.sub(r"@(\w+)", ' ', text, flags=re.MULTILINE)
    # Removing punctuations
    text = re.sub(r'[^\w\d\s]', ' ', text)
    # convert to lower case
    text = re.sub(r'^\s+|\s+?$', ' ', text.lower())
    # Removing digits
    text = re.sub(r'\d', ' ', text)
    # Removing other symbols
    text = re.sub('[ãâªð³ÂÃÃ±¤¡¥¶¦§_®¯¹¾²µ½¼º]+', ' ', text)
    # collapse all white spaces
    text = re.sub(r'\s+', ' ', text)
    # text = re.sub('[Ã]', ' ', str(text))
    # remove stop words and perform stemming
    stop_words = nltk.corpus.stopwords.words('english')
    # 
    lemmatizer = WordNetLemmatizer()
    text = ' '.join(lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words)

    return [hashtags, users, links, [text]]

  

  def predict(self, text):
    tfidf_text = tfidf_vec.transform(text)
    print(model.predict(tfidf_text))
    return 


In [0]:
text = "you r a nice person. I am very grateful to you"
# text = pd.Series(text)
# print(text)
processing = TextPreprocessor()

In [0]:
output = processing._preprocess_text(text)
output

[[], [], [], ['r nice person grateful']]

In [0]:
tfidf_text = tfidf_vec.transform([text])

In [0]:
tfidf_text = tf1_new.fit_transform([text])

In [0]:
print(tfidf_text)

  (0, 6421)	0.5773502691896258
  (0, 5756)	0.5773502691896258
  (0, 3826)	0.5773502691896258


In [0]:
x = processing.predict(output[-1])
x

[0]


Checking the difference

In [0]:
df = pd.read_csv('outputs/processed_wos.csv')
df.head()

Unnamed: 0,text,label,hashtags,users,links,processed_text
0,"""like this if you are a tribe fan""",0,[],[],[],like tribe fan
1,"""you're idiot.......................""",1,[],[],[],idiot
2,"""I am a woman Babs, and the only ""war on women...",1,[],[],[],woman babs war woman see coming jackazzes like...
3,"""WOW & YOU BENEFITTED SO MANY WINS THIS YEAR F...",1,[],[],[],wow benefitted many win year bat nice stupid
4,"""haha green me red you now loser whos winning ...",1,[],[],[],haha green red loser who winning moron


In [0]:
X = df['processed_text']
X

0                                            like tribe fan
1                                                     idiot
2         woman babs war woman see coming jackazzes like...
3              wow benefitted many win year bat nice stupid
4                    haha green red loser who winning moron
                                ...                        
196383    second time asking view completely contradicts...
196384                 ashamed horrible thing put talk page
196385    spitzer umm there actual article prostitution ...
196386    look like actually put speedy first version de...
196387    really think understand came idea bad right aw...
Name: processed_text, Length: 196388, dtype: object

In [0]:
import re
def preprocess(text):
    text = re.sub(r'[^\w\d\s]', ' ', str(text))
    text = re.sub('[Ã]', ' ', str(text))
    return ' '.join(term for term in text.split())
df['processed_text'] = df.processed_text.apply(lambda row : preprocess(row))

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
tfidf_text =tfidf_vec.transform(df.processed_text)

In [0]:
predictions = model.predict(tfidf_text).tolist()
# predictions

In [0]:
y_test = df['label'].values

In [0]:
from sklearn.metrics import classification_report
print(classification_report(predictions, y_test))

              precision    recall  f1-score   support

           0       0.98      0.95      0.97    160079
           1       0.82      0.93      0.87     36309

    accuracy                           0.95    196388
   macro avg       0.90      0.94      0.92    196388
weighted avg       0.95      0.95      0.95    196388

