In this notebook, I scrape posts from Twitter in order to test my best-performing model on new, real-world data. Once the model's predictions on this data have been checked and confirmed, I can confidently deploy my cyberbullying detection app.

In [5]:
# importing necessary libraries and packages
import pandas as pd
import numpy as np

import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn import svm

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import classification_report, accuracy_score, recall_score, f1_score, confusion_matrix, precision_recall_curve, plot_confusion_matrix, auc

import pickle

In [6]:
# pattern matching any run of one or more characters outside the 7-bit ASCII range
non_ascii_regex = r'[^\x00-\x7F]+'

# stop words: NLTK's English list followed by every single punctuation character
stopword_list = stopwords.words('english') + list(string.punctuation)

# function to remove special characters, tokenize, and stem content
def process(content):
    """Turn one post into a list of stemmed, stop-word-free tokens.

    Steps, in order: replace every run of non-ASCII characters with a space,
    tokenize with NLTK's ``TweetTokenizer`` (``strip_handles=True`` removes
    @usernames), drop tokens listed in ``stopword_list``, and Porter-stem
    whatever is left.

    Parameters
    ----------
    content : str
        Raw text of a single post.

    Returns
    -------
    list of str
        Lower-cased, stemmed tokens in their original order.

    NOTE(review): if a pickled pipeline was fitted with a different
    tokenization than this one, re-fit it so training and inference agree.
    """
    # strip non-ASCII runs before tokenizing
    text = re.sub(non_ascii_regex, ' ', content)

    # tokenize the cleaned text (not the raw `content`) so the substitution
    # above actually takes effect; strip_handles drops @usernames
    tokens = TweetTokenizer(strip_handles=True).tokenize(text)

    ps = PorterStemmer()
    # set membership is O(1); the module-level list is left untouched
    stop_set = set(stopword_list)

    clean_tokens = []
    for token in tokens:
        lowered = token.lower()
        # compare the lower-cased form so capitalized stop words are removed too
        if lowered in stop_set:
            continue
        try:
            clean_tokens.append(ps.stem(lowered))
        except Exception:
            # best-effort: skip a token the stemmer cannot handle rather than
            # failing the whole post
            continue

    return clean_tokens

In [9]:
# Restore the saved pipeline from disk.
# Unpickling can execute arbitrary code, so only load pickle files you trust.
with open('pickles/lr_pipeline.pkl', mode='rb') as file:
    model = pickle.load(file)

# quick smoke test: the pipeline is called directly on a list of raw strings
model.predict(['i hate you'])

array([1.])

In [None]:
def preprocess(data):
    """Lemmatize posts and project them onto the pickled "relevant word" TF-IDF columns.

    Each post is tokenized with a letters-only regex, lower-cased, stripped of
    English stop words, verb-lemmatized, and filtered down to words found in
    ``english`` (the token ``'lb'`` is always dropped). The cleaned posts are
    transformed with the TF-IDF vectorizer stored in ``pickles/tfidf.pkl`` and
    only the columns named in ``pickles/relevantwords.pkl`` are kept.

    Parameters
    ----------
    data : iterable of str
        Raw posts.

    Returns
    -------
    pandas.DataFrame
        The selected TF-IDF columns, in the order they appear in the pickled
        relevant-word list; words missing from the vectorizer vocabulary are
        skipped.

    NOTE(review): ``wordnet_lemmatizer`` and ``english`` are not defined in any
    cell visible in this notebook; they must exist in the kernel before this
    function is called. Presumably a ``WordNetLemmatizer()`` instance and a
    collection of English words -- confirm.
    """
    pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
    # built once here instead of once per document
    stop_words = set(stopwords.words('english'))

    def lemmadata(doc):
        """Return `doc` as a space-joined string of filtered verb lemmas."""
        tokens = [tok.lower() for tok in nltk.regexp_tokenize(doc, pattern)]
        kept = [w for w in tokens if w not in stop_words]
        lemmas = [wordnet_lemmatizer.lemmatize(w, pos="v") for w in kept]
        return " ".join(w for w in lemmas if w != 'lb' and w in english)

    lemmatized = [lemmadata(post) for post in data]

    # fitted TF-IDF vectorizer; `with` closes the file handle deterministically.
    # pickle can execute arbitrary code -- only load trusted files.
    with open("pickles/tfidf.pkl", "rb") as tfidf_file:
        tfidf = pickle.load(tfidf_file)

    transformed = tfidf.transform(lemmatized)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when available and fall back on older installs
    if hasattr(tfidf, "get_feature_names_out"):
        feature_names = tfidf.get_feature_names_out()
    else:
        feature_names = tfidf.get_feature_names()
    tfidf_df = pd.DataFrame(transformed.toarray(), columns=feature_names)

    # pickled list of relevant words
    with open("pickles/relevantwords.pkl", "rb") as relevant_file:
        relevant = pickle.load(relevant_file)

    testset = [tfidf_df[word] for word in relevant if word in tfidf_df.columns]

    return pd.DataFrame(testset).transpose()


In [None]:
def classify_text(text, model=None):
    """Classify a single post with the saved pipeline.

    The pipeline stored in ``pickles/lr_pipeline.pkl`` is called on raw
    strings elsewhere in this notebook (``model.predict(['i hate you'])``),
    so the text is passed to it unchanged rather than as a pre-computed
    TF-IDF frame.

    Parameters
    ----------
    text : str
        Raw text of one post.
    model : object with a ``predict`` method, optional
        An already-loaded pipeline. When omitted, the pipeline is unpickled
        from ``pickles/lr_pipeline.pkl`` on each call.

    Returns
    -------
    The first (and only) element of ``model.predict([text])``.
    """
    if model is None:
        # `with` closes the file handle; pickle can execute arbitrary code,
        # so only load trusted files
        with open("pickles/lr_pipeline.pkl", "rb") as model_file:
            model = pickle.load(model_file)

    # predict() expects a collection of documents, so wrap the single post
    return model.predict([text])[0]