# Jupyter Notebook Playground

In [None]:
# Smoke test: print a greeting to check that the kernel executes cells.
print("hello world!")

In [None]:
# Names assigned in one cell stay in the kernel namespace, so the next cell
# can read this value back.
my_variable = 101

In [None]:
# Show the value assigned in the previous cell.
print(my_variable)

# Text Classification Tutorial 
Credit to: https://stackabuse.com/text-classification-with-python-and-scikit-learn/

In [None]:
import numpy as np
import re
import nltk
from sklearn.datasets import load_files
import pickle
from nltk.corpus import stopwords

In [None]:
# Read the review corpus from disk; the path is relative to the notebook's
# working directory.
movie_data = load_files(r"review_polarity/txt_sentoken")

X = movie_data.data    # raw review contents, one entry per file
y = movie_data.target  # class label of each review, aligned with X

## Preprocessing The Data

In [None]:
from nltk.stem import WordNetLemmatizer

# Reduces each token to its dictionary form. The name `stemmer` is kept from
# the original tutorial even though this is a lemmatizer, not a stemmer.
stemmer = WordNetLemmatizer()

# Cleaned text for every entry of X, in the same order as X.
documents = []

for raw_review in X:
    # Replace every non-word character with a space. str() is applied first,
    # so a bytes review is processed through its b'...' representation.
    document = re.sub(r'\W', ' ', str(raw_review))

    # Remove single letters that are surrounded by whitespace.
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove a single letter at the very start of the text. The pattern has to
    # use the ^ anchor: the escaped form \^ matches only a literal caret, which
    # cannot occur after the \W substitution above, so it never removed
    # anything. This rule also strips the leading 'b' left over from a bytes
    # repr, which makes a separate "remove prefixed b" step unnecessary.
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)

    # Collapse every run of whitespace into one space.
    document = re.sub(r'\s+', ' ', document)

    # Lower-case, tokenize on whitespace, lemmatize each token and re-join.
    tokens = document.lower().split()
    document = ' '.join(stemmer.lemmatize(token) for token in tokens)

    documents.append(document)

## Feature Engineering

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts: keep at most 1500 terms, ignore terms seen in fewer
# than 5 documents or in more than 70% of them, and drop English stop words.
vectorizer = CountVectorizer(
    max_features=1500,
    min_df=5,
    max_df=0.7,
    stop_words=stopwords.words('english'),
)

# Dense count matrix: one row per document, one column per kept term.
X1 = vectorizer.fit_transform(documents).toarray()

CountVectorizer makes every unique word a feature for the ML model. Parameters:

- We take a max of 1500 features (the 1500 most frequently occurring unique words)
- min_df is the minimum number of documents a word must appear in to be kept
- max_df is the maximum percentage of documents that may contain the word (0.7 means words found in more than 70% of documents are dropped)
- Finally, stop_words removes very common words in the English language

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw counts with TF-IDF.
# NOTE(review): the transformer is fitted on all documents, before the
# train/test split done later in the notebook, so test rows influence the
# IDF weights.
tfidfconverter = TfidfTransformer()
tfidf_sparse = tfidfconverter.fit_transform(X1)

# Dense array so it can be split and fed to the classifier below.
X2 = tfidf_sparse.toarray()

TfidfTransformer turns the count array computed in the previous step into an array of weights for each unique word based on the document:
- TF-IDF weight is a weight often used in information retrieval and text mining
- This weight is a statistical measure used to evaluate how important a word is to a document in a collection or corpus
- The importance increases proportionally to the number of times a word appears in the document but is offset by the frequency of the word in the corpus
- Variations of the tf-idf weighting scheme are often used by search engines as a central tool in scoring and ranking a document's relevance given a user query

## Splitting The Data (Train-Test Split)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X2,
    y,
    test_size=0.2,
    random_state=0,
)

## Training The Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 1000 trees; random_state is fixed so repeated runs train the
# same model.
classifier = RandomForestClassifier(
    n_estimators=1000,
    random_state=0,
)

# fit() is the cell's last expression, so the notebook also displays the
# fitted estimator.
classifier.fit(X_train, y_train)

In [None]:
# Predicted class label for every held-out review; compared against y_test
# in the evaluation section.
y_pred = classifier.predict(X_test)

Random Forest example:

<img align="left" src=https://miro.medium.com/max/900/1*EFBVZvHEIoMdYHjvAZg8Zg.gif width="450" /> 
<img align="left" src=https://static.javatpoint.com/tutorial/machine-learning/images/random-forest-algorithm2.png width="450" />


Pros of random forest:
- Great predictive performance for binary classification
- They provide a reliable feature importance estimate
- They offer efficient estimates of the test error without incurring the cost of repeated model training associated with cross-validation
- Handles thousands of input variables without variable deletion

Cons of random forest:
- An ensemble model is inherently less interpretable than an individual decision tree
- Training a large number of deep trees can have high computational costs (but can be parallelized) and use a lot of memory
- Predictions are slower, which may create challenges for applications

More considerations: 
- https://github.com/TayariAmine/ML_cheat_sheet/wiki/Random-forest-Pros-and-Cons
- https://www.oreilly.com/library/view/hands-on-machine-learning/9781789346411/e17de38e-421e-4577-afc3-efdd4e02a468.xhtml

## Evaluating The Model

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Score the held-out predictions first, then report them.
test_confusion = confusion_matrix(y_test,y_pred)
test_accuracy = accuracy_score(y_test, y_pred)

print("CONFUSION MATRIX: \n{}".format(test_confusion))
print("\nACCURACY: {}".format(test_accuracy))

<img src=https://glassboxmedicine.files.wordpress.com/2019/02/confusion-matrix.png width="500">

In [None]:
# Text summary of the per-class metrics for the held-out set; print() keeps
# the report's line breaks intact.
print(classification_report(y_test,y_pred))

## Saving The Model

In [None]:
# Serialize the trained classifier to a file named 'text_classifier' in the
# current working directory. Only the classifier is saved here; the fitted
# vectorizer and TF-IDF transformer are not.
with open('text_classifier', 'wb') as picklefile:
    pickle.dump(classifier,picklefile)

In [None]:
# Read the classifier back from the 'text_classifier' file to check that it
# survives a save/load round trip.
# pickle.load can execute arbitrary code: only unpickle files you created
# yourself or otherwise trust.
with open('text_classifier', 'rb') as training_model:
    model = pickle.load(training_model)

In [None]:
# Predictions from the reloaded model, scored with the same metrics as above.
y_pred2 = model.predict(X_test)

# Print the confusion matrix, the per-class report and the accuracy, in that
# order.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, y_pred2))

## Try It Yourself!

In [None]:
import numpy as np
import re
import nltk
from sklearn.datasets import load_files
import pickle
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfTransformer

# Load the review corpus again so this section can run on its own.
movie_data = load_files(r"review_polarity/txt_sentoken")

X = movie_data.data    # raw review contents, one entry per file
y = movie_data.target  # class label of each review, aligned with X

# Preprocessing functions
def _clean_review(raw_review, lemmatizer):
    """Return one review as lower-cased, lemmatized, space-separated words."""
    # Replace every non-word character with a space. str() is applied first,
    # so a bytes review is processed through its b'...' representation.
    text = re.sub(r'\W', ' ', str(raw_review))

    # Remove single letters that are surrounded by whitespace.
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)

    # Remove a single letter at the very start of the text. The pattern has to
    # use the ^ anchor: the escaped form \^ matches only a literal caret, which
    # cannot occur after the \W substitution above, so it never removed
    # anything. This rule also strips the leading 'b' left over from a bytes
    # repr, which makes a separate "remove prefixed b" step unnecessary.
    text = re.sub(r'^[a-zA-Z]\s+', ' ', text)

    # Collapse every run of whitespace into one space.
    text = re.sub(r'\s+', ' ', text)

    # Lower-case, tokenize on whitespace, lemmatize each token and re-join.
    tokens = text.lower().split()
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)


def preprocessing(input_X):
    """Clean raw reviews and turn them into TF-IDF feature vectors.

    Parameters
    ----------
    input_X : sequence of str or bytes
        Raw review texts. Every item is passed through ``str()``, so bytes
        items are handled through their ``b'...'`` representation.

    Returns
    -------
    X2 : array
        Dense TF-IDF matrix with one row per item of ``input_X`` and at most
        1500 columns.
    documents : list of str
        Cleaned, lower-cased, lemmatized text of every item, in input order.

    Notes
    -----
    NOTE(review): a new CountVectorizer and TfidfTransformer are fitted on
    ``input_X`` on every call. The selected vocabulary, and therefore the
    meaning of each column, depends on the input; confirm it matches the
    features the pickled classifier was trained on before trusting its
    predictions.
    """
    lemmatizer = WordNetLemmatizer()
    documents = [_clean_review(raw_review, lemmatizer) for raw_review in input_X]

    # Same vectorizer settings as the training section of the notebook.
    vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
    X1 = vectorizer.fit_transform(documents).toarray()

    tfidfconverter = TfidfTransformer()
    X2 = tfidfconverter.fit_transform(X1).toarray()

    return X2, documents

# Load the classifier from the 'text_classifier' file; it must already exist
# in the working directory (the "Saving The Model" section writes it).
# pickle.load can execute arbitrary code: only unpickle files you trust.
with open('text_classifier', 'rb') as training_model:
    model = pickle.load(training_model)

Change the sentences in the 'new_X_test' array below!

In [None]:
# Sentences to classify; edit this list to try your own reviews.
new_X_test = ['I loved this movie so much!', 'This movie was bad and had terrible actors']
# Number of user sentences: they occupy the first `count` rows of everything
# preprocessing() returns.
count = len(new_X_test)
# Append the whole review corpus after the user sentences; presumably this is
# so the vectorizer fitted inside preprocessing() sees a full corpus rather
# than just these few sentences.
new_X_test.extend(X)

# new_X2 holds the TF-IDF rows and new_documents the cleaned text, both in
# the same order as new_X_test.
new_X2, new_documents = preprocessing(new_X_test)

In [None]:
# Only the first `count` rows belong to the user-supplied sentences; the rest
# are the corpus reviews appended after them.
for review, features in zip(new_documents[:count], new_X2[:count]):
    # predict() expects a 2-D input, so the single row is wrapped in a list.
    prediction = model.predict([features])
    print("review: {}, pred: {}".format(review, prediction))
    