# NLTK Tutorial using Scikit Learn

## Gender Identification

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from nltk.corpus import names
import nltk
import random

In [3]:
#build one list of (name, gender) tuples: all male entries first, then all female ones
labeled_names = [(name, gender)
                 for gender in ('male', 'female')
                 for name in names.words(gender + '.txt')]

In [4]:
#shuffle the above collection of name-gender pairs in place
#(no seed is set in the cells shown, so the order differs on every run)
random.shuffle(labeled_names)

In [5]:
#split the shuffled pairs into two parallel lists
#NOTE: this rebinds `names`, which until now referred to the nltk corpus imported above,
#so re-running the cell that calls names.words(...) after this one will fail
names = [pair[0] for pair in labeled_names]
genders = [pair[1] for pair in labeled_names]

In [6]:
#assemble the two parallel lists into a dataframe, one column each
df = pd.DataFrame({'Name': names, 'Gender': genders})

In [7]:
#let's look at what our dataframe looks like
df.head()

Unnamed: 0,Name,Gender
0,Georgiana,female
1,Hanny,female
2,Gerti,female
3,Dacie,female
4,Hersh,male


In [8]:
#let's create a copy to modify as we go on
df1 = df.copy()

In [9]:
#add a column holding the final character of every name
df1['Last_Letter'] = df1.Name.map(lambda full_name: full_name[-1])

## Splitting the data 

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
#feature: the name's last character, used to train our model
#label: the gender the model should predict
X, y = df1['Last_Letter'], df1['Gender']

In [12]:
#since our models only accept numerical values, let us turn our data into
#one-hot-encoded variables
#(drop_first=True leaves out the indicator column of the first category)
X = pd.get_dummies(X,drop_first=True,prefix='Last_Letter')

In [13]:
#hold out 33% of the rows for testing; random_state=None means the split
#(and therefore the score below) changes from run to run
X_train,X_test,y_train,y_test = train_test_split(
    X,y,test_size=0.33,random_state=None)

In [14]:
#preview the one-hot-encoded features
X.head()

Unnamed: 0,Last_Letter_a,Last_Letter_b,Last_Letter_c,Last_Letter_d,Last_Letter_e,Last_Letter_f,Last_Letter_g,Last_Letter_h,Last_Letter_i,Last_Letter_j,...,Last_Letter_p,Last_Letter_r,Last_Letter_s,Last_Letter_t,Last_Letter_u,Last_Letter_v,Last_Letter_w,Last_Letter_x,Last_Letter_y,Last_Letter_z
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


## Fitting the model

In [15]:
from sklearn.naive_bayes import MultinomialNB

In [16]:
#multinomial naive Bayes classifier with its default settings
clf = MultinomialNB()

In [17]:
#train the classifier on the training split
clf.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [18]:
#mean accuracy on the held-out test split
clf.score(X_test,y_test)

0.7585812356979404

## Adding more Features

In [19]:
#add a column holding the lower-cased first character of every name
df1['First_Letter'] = df1.Name.map(lambda full_name: full_name[0].lower())

In [20]:
#check the last rows to confirm the new column
df1.tail()

Unnamed: 0,Name,Gender,Last_Letter,First_Letter
7939,Anatol,male,l,a
7940,Kettie,female,e,k
7941,Ema,female,a,e
7942,Marga,female,a,m
7943,Bessie,female,e,b


In [21]:
#one-hot-encode the first letter (FL_*) and the last letter (LL_*) separately
x1 = pd.get_dummies(df1['First_Letter'],prefix='FL',drop_first=True)
x2 = pd.get_dummies(df1['Last_Letter'],prefix='LL',drop_first=True)

In [22]:
#put both sets of indicator columns side by side
X1 = pd.concat([x1,x2],axis=1)

In [23]:
#add one column per letter with how often that letter occurs in the name;
#the names are lower-cased before counting because str.count is case-sensitive
#and would otherwise never count the capitalised first letter
for letter in 'abcdefghijklmnopqrstuvwxyz':
    X1[letter+'Count'] = df1.Name.str.lower().str.count(letter)

In [24]:
#preview the combined feature matrix
X1.head()

Unnamed: 0,FL_b,FL_c,FL_d,FL_e,FL_f,FL_g,FL_h,FL_i,FL_j,FL_k,...,qCount,rCount,sCount,tCount,uCount,vCount,wCount,xCount,yCount,zCount
0,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,1,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,1,1,0,0,0,0,0,0,0


In [25]:
#new 67/33 split on the richer feature set; random_state=None means the rows
#held out here are not the same ones as in the previous experiment
X_train,X_test,y_train,y_test = train_test_split(
    X1,y,test_size=0.33,random_state=None)

In [26]:
#re-train the same classifier object on the new training split
clf.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [27]:
#mean accuracy on the new held-out test split
clf.score(X_test,y_test)

0.7913806254767353

### Using the first letter and the last two letters

In [28]:
#add a column holding the final two characters of every name
df1['Last_two_Letter'] = df1.Name.map(lambda full_name: full_name[-2:])

In [29]:
#check the first rows to confirm the new column
df1.head()

Unnamed: 0,Name,Gender,Last_Letter,First_Letter,Last_two_Letter
0,Georgiana,female,a,g,na
1,Hanny,female,y,h,ny
2,Gerti,female,i,g,ti
3,Dacie,female,e,d,ie
4,Hersh,male,h,h,sh


In [30]:
#one-hot-encode the two-letter endings (LTL_*)
x3 = pd.get_dummies(df1.Last_two_Letter,drop_first=True,prefix='LTL')

In [31]:
#combine the first-letter indicators (x1) with the two-letter-ending indicators (x3);
#this replaces the X1 built earlier
X1 = pd.concat([x1,x3],axis=1)

In [32]:
#preview the new feature matrix
X1.head()

Unnamed: 0,FL_b,FL_c,FL_d,FL_e,FL_f,FL_g,FL_h,FL_i,FL_j,FL_k,...,LTL_yl,LTL_ym,LTL_yn,LTL_ys,LTL_yt,LTL_za,LTL_ze,LTL_zi,LTL_zo,LTL_zy
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
#fresh 67/33 split on the new features (unseeded, so it differs from run to run)
X_train,X_test,y_train,y_test = train_test_split(
    X1,y,test_size=0.33,random_state=None)

In [34]:
#re-train the same classifier object on the new training split
clf.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [35]:
#mean accuracy on the new held-out test split
clf.score(X_test,y_test)

0.8032036613272311

## Document Classification

In [36]:
from nltk.corpus import movie_reviews

In [37]:
#create one (text, category) pair per movie review, both negative and positive;
#each review's words are joined back into a single space-separated string
documents = [(' '.join(movie_reviews.words(fileid)),category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]
#shuffle the documents so that reviews with the same rating are not grouped together
random.shuffle(documents)

In [38]:
#split the (review, rating) pairs into two parallel lists
review = [doc[0] for doc in documents]
rating = [doc[1] for doc in documents]

In [39]:
#create a dataframe and give each of the above lists its own column
reviews_df = pd.DataFrame({'Review': review, 'Rating': rating})

In [40]:
#let us view the first rows of our dataframe
reviews_df.head()

Unnamed: 0,Review,Rating
0,kate ( jennifer aniston ) is having some probl...,neg
1,i think of i know what you did last summer as ...,neg
2,"as fairy tales go , cinderella has to be one o...",pos
3,almost a full decade before steven spielberg '...,pos
4,""" have you ever heard the one about a movie so...",neg


In [41]:
import string

In [42]:
#let us remove all the punctuation tokens
#a token is dropped when nothing is left after stripping punctuation characters,
#i.e. when it consists of punctuation only; unlike a plain substring test against
#string.punctuation this also catches multi-character tokens such as '--' or '...'
reviews_df['Review_Clean'] = reviews_df.Review.apply(
    lambda x:' '.join(t for t in x.split() if t.strip(string.punctuation)))

In [43]:
#compare the raw and the cleaned reviews
reviews_df.head()

Unnamed: 0,Review,Rating,Review_Clean
0,kate ( jennifer aniston ) is having some probl...,neg,kate jennifer aniston is having some problems ...
1,i think of i know what you did last summer as ...,neg,i think of i know what you did last summer as ...
2,"as fairy tales go , cinderella has to be one o...",pos,as fairy tales go cinderella has to be one of ...
3,almost a full decade before steven spielberg '...,pos,almost a full decade before steven spielberg s...
4,""" have you ever heard the one about a movie so...",neg,have you ever heard the one about a movie so b...


In [44]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [45]:
#using CountVectorizer produced less accurate results,
#so the reviews are turned into TF-IDF features with English stop words removed
vector = TfidfVectorizer(stop_words='english')

In [46]:
#feature: the cleaned review text; label: its pos/neg rating
X, y = reviews_df['Review_Clean'], reviews_df['Rating']

In [47]:
#hold out 33% of the reviews for testing (unseeded, so the split differs from run to run)
X_train,X_test,y_train,y_test = train_test_split(
    X,y,test_size=0.33,random_state=None)

In [48]:
#learn the vocabulary and idf weights from the training reviews only, then apply
#them unchanged to the test reviews;
#the matrices are kept sparse: MultinomialNB accepts sparse input, and a dense copy
#would only store a very large number of zeros
X_train_bow = vector.fit_transform(X_train)
X_test_bow = vector.transform(X_test)

In [49]:
from sklearn.ensemble import RandomForestClassifier

In [50]:
#start from a fresh multinomial naive Bayes classifier for the review task
clf = MultinomialNB()

In [51]:
#train on the TF-IDF features of the training reviews
clf.fit(X_train_bow,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [52]:
#mean accuracy on the held-out test reviews
clf.score(X_test_bow,y_test)

0.8015151515151515