# Natural Language Processing

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import wordcloud
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('stopwords')

In [None]:
# Short sample text that the following cells run through each
# preprocessing step (cleanup, tokenization, lemmatization, ...).
textsample = """
We are all agreeing with the cat on this when she says that a large amount of fluff is indeed 
helpful in almost all of life's situations. Do you agree, too?
"""

### Basic Cleanup

In [None]:
def clean_text(text):
    """Return a normalized copy of ``text`` for further NLP processing.

    The result is lowercase, contains only ASCII letters, digits and single
    spaces, and has no leading or trailing whitespace.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string; an empty string if nothing usable remains.
    """
    # Turn every run of whitespace (spaces, tabs, line breaks) into a single
    # space first, so that words on adjacent lines are not glued together
    # when the non-alphanumeric characters are removed below.
    text_clean = re.sub(r"\s+", " ", text)
    # remove special signs
    text_clean = re.sub(r"[^a-zA-Z0-9 ]", "", text_clean)
    # removing signs can create new runs of spaces ("a - b" -> "a  b");
    # collapse runs of any length, not just pairs
    text_clean = re.sub(r" {2,}", " ", text_clean)
    # trim the edges and convert to lowercase
    return text_clean.strip().lower()

# Apply the basic cleanup to the sample text; the tokenization step reads
# its input from textsample_clean.
textsample_clean = clean_text(textsample)

### Tokenization

In [None]:
from nltk.tokenize import word_tokenize
# Split the cleaned sample text into a list of word tokens and show it.
# NOTE(review): word_tokenize presumably needs NLTK tokenizer data to be
# installed (see the commented-out nltk.download calls next to the imports).
textsample_tokenize = word_tokenize(textsample_clean)
print(textsample_tokenize)

### Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
# Lemmatizer instance reused by the cells below.
# NOTE(review): presumably requires the NLTK 'wordnet' corpus (see the
# commented-out nltk.download calls next to the imports).
lemmatizer = WordNetLemmatizer()

In [None]:
# Run every token through the lemmatizer (called with its default
# arguments) and collect the results in a new list, keeping token order.
textsample_lemmatize = list(map(lemmatizer.lemmatize, textsample_tokenize))
print(textsample_lemmatize)

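After lemmatization, each token is replaced by its lemma: a valid dictionary word with the same meaning. Note that `lemmatize` is called here without a part-of-speech tag, so it treats every token as a noun by default and leaves verb forms such as "agreeing" unchanged.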

### Stemming

In [None]:
from nltk.stem import SnowballStemmer
# Snowball stemmer configured for English; reused by the cells below.
stemmer = SnowballStemmer("english")

In [None]:
# Stem every (already lemmatized) token and collect the stems in a new
# list, keeping token order.
textsample_stem = list(map(stemmer.stem, textsample_lemmatize))
print(textsample_stem)

Stemming strips suffixes to reduce each word to a base form (its stem). Unlike lemmas, stems are not necessarily meaningful words.

### Removing Stopwords

In [None]:
from nltk.corpus import stopwords
# English stop words as a list (kept as a list because it is displayed as-is
# further down).
stopwords_list = stopwords.words("english")
# Set copy of the same words for constant-time membership tests.
stopwords_set = set(stopwords_list)

# Keep only the tokens that are not stop words.
# NOTE(review): the tokens were stemmed before this step, so a stop word
# whose stem differs from its listed form is not removed here; consider
# filtering stop words before stemming.
textsample_nostopwords = [item for item in textsample_stem if item not in stopwords_set]
print(textsample_nostopwords)

In [None]:
# Display the English stop word list used for filtering.
print(stopwords_list)

### Vectorizing Text

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Count-based (bag-of-words) vectorizer created with its default settings.
vectorizer = CountVectorizer()

In [None]:
# Fit the vectorizer on the remaining tokens and show the counts as a dense
# array.
# NOTE(review): the input is a flat list of tokens, so each token is handed
# to the vectorizer as its own document (one row per token) - confirm this
# is the intended demonstration.
vectorizer.fit_transform(textsample_nostopwords).toarray()

## Applying to real data: IMDB movie reviews

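Get the data from here: http://ai.stanford.edu/~amaas/data/sentiment/. The code below expects it as a CSV file named `imdb_sentiment.csv` with a `review` column (the review text) and a `target` column (the sentiment label).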


In [None]:
# Load the review data from a CSV file; the relative path is resolved
# against the current working directory.
df = pd.read_csv('imdb_sentiment.csv')

## Preparing the data

In [None]:
# Show how many rows carry each distinct value of the 'target' column.
df['target'].value_counts()

In [None]:
# Lowercase and trim each review, then keep only letters, digits and spaces.
# HTML line-break tags are replaced by a space first: otherwise stripping
# the punctuation would turn "end.<br /><br />Start" into "endbr br start",
# fusing words and adding spurious "br" tokens.
# NOTE(review): raw IMDB review text typically contains "<br />" markup -
# verify against the CSV actually used.
df['review_clean'] = df['review'].apply(
    lambda row: re.sub("[^A-Za-z0-9 ]", '', re.sub(r"<br\s*/?>", " ", row.lower().strip()))
)
df.head()

## Tokenization

In [None]:
# Tokenize every cleaned review; each cell of the new column holds the
# value returned by word_tokenize for that review.
df['review_tokenize'] = df['review_clean'].apply(word_tokenize)
df.head()

## Lemmatization

In [None]:
lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Return a new list holding the lemma of every token in ``tokens``."""
    return list(map(lemmatizer.lemmatize, tokens))


# Lemmatize the token list of every review, one row at a time.
df['review_lemmatize'] = df['review_tokenize'].apply(_lemmatize_tokens)
df.head()

## Stemming

In [None]:
stemmer = SnowballStemmer("english")


def _stem_tokens(tokens):
    """Return a new list holding the stem of every token in ``tokens``."""
    return list(map(stemmer.stem, tokens))


# Stem the (already lemmatized) token list of every review.
df['review_stem'] = df['review_lemmatize'].apply(_stem_tokens)
df.head()

## Removing Stopwords

In [None]:
stopwords_list = stopwords.words("english")
# Set copy for constant-time membership tests; the lookup below runs once
# per token of every review, so scanning the list each time is wasteful.
stopwords_set = frozenset(stopwords_list)

# Drop stop words from the token list of every review.
# NOTE(review): the tokens were stemmed before this step, so a stop word
# whose stem differs from its listed form is not removed here; consider
# filtering stop words before stemming.
df['review_nostopwords'] = df['review_stem'].apply(lambda row: [item for item in row if item not in stopwords_set])
df.head()

## Vectorizing Text

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# The analyzer is an identity function, so the vectorizer uses each
# document exactly as given instead of tokenizing it again; the reviews
# passed to it are already lists of tokens.
vectorizer = CountVectorizer(analyzer=lambda x: x)

In [None]:
# Build the document-term count matrix, one row per review.
# NOTE(review): .toarray() makes the matrix dense; with many reviews and a
# large vocabulary this may need a lot of memory - confirm it fits.
X = vectorizer.fit_transform(df['review_nostopwords']).toarray()
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older versions.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# Label every column with the token it counts.
X = pd.DataFrame(X, columns=feature_names)

## Splitting into train and test set

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows as a test set; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, df['target'], test_size=0.33, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression classifier with default settings on the
# training split.
clf = LogisticRegression()
clf.fit(X_train, y_train)

In [None]:
# Predict a label for every row of the held-out test set.
pred = clf.predict(X_test)

In [None]:
# Evaluate the fitted classifier on the test split (for a classifier,
# score() reports mean accuracy).
clf.score(X_test, y_test)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Print precision, recall and F1 of the test-set predictions, one value per
# line and in that order.
# NOTE(review): the metrics are called with their defaults, which presumably
# expect a binary target - confirm against the label encoding.
for metric in (precision_score, recall_score, f1_score):
    print(metric(y_test, pred))