# NLTK Tutorial using Scikit Learn

## Gender Identification

In [37]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
from nltk.corpus import names
import nltk
import random

In [2]:
#create a list of tupples of name and gender
labeled_names = ([(name,'male') for name in names.words('male.txt')]+
                [(name,'female') for name in names.words('female.txt')])

In [3]:
#shuffle the above collection of name-gender pairs
random.shuffle(labeled_names)

In [49]:
#break it down to individual parts
names = [name for name,gender in labeled_names]
genders = [gender for name,gender in labeled_names]

In [50]:
#create a dataframe with the above information
df = pd.DataFrame()
df['Name'] = names
df['Gender'] = genders

In [51]:
#lets look at how our dataframe looks like
df.head()

Unnamed: 0,Name,Gender
0,Shirley,female
1,Judye,female
2,Eric,male
3,Latia,female
4,Imogene,female


In [52]:
#lets create a copy to modify as we go on
df1 = df.copy()

In [53]:
#create a new column using the name's last letter/character
df1['Last_Letter'] = df1['Name'].apply(lambda x:x[-1])

## Splitting the data 

In [54]:
from sklearn.model_selection import train_test_split

In [87]:
X = df1['Last_Letter']  #we will use the name's last character to train our model
y = df1['Gender']       #get the gender as the label to train our model

In [88]:
#since our models only except numerical values, let us turn our data into 
#one-hot-encoded variables
X = pd.get_dummies(X,drop_first=True,prefix='Last_Letter')

In [89]:
X_train,X_test,y_train,y_test = train_test_split(
    X,y,test_size=0.33,random_state=None)

In [90]:
X.head()

Unnamed: 0,Last_Letter_a,Last_Letter_b,Last_Letter_c,Last_Letter_d,Last_Letter_e,Last_Letter_f,Last_Letter_g,Last_Letter_h,Last_Letter_i,Last_Letter_j,...,Last_Letter_p,Last_Letter_r,Last_Letter_s,Last_Letter_t,Last_Letter_u,Last_Letter_v,Last_Letter_w,Last_Letter_x,Last_Letter_y,Last_Letter_z
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Fitting the model

In [57]:
from sklearn.naive_bayes import MultinomialNB

In [71]:
clf = MultinomialNB()

In [72]:
clf.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [73]:
clf.score(X_test,y_test)

0.7440884820747521

## Adding more Features

In [93]:
df1['First_Letter'] = df1['Name'].apply(lambda x:x[0].lower())

In [95]:
df1.tail()

Unnamed: 0,Name,Gender,Last_Letter,First_Letter
7939,Ron,male,n,r
7940,Carie,female,e,c
7941,Aub,male,b,a
7942,Em,female,m,e
7943,Oprah,female,h,o


In [97]:
x1 = pd.get_dummies(df1['First_Letter'],prefix='FL',drop_first=True)
x2 = pd.get_dummies(df1['Last_Letter'],prefix='LL',drop_first=True)

In [99]:
X1 = pd.concat([x1,x2],axis=1)

In [100]:
for letter in 'abcdefghijklmnopqrstuvwxyz':
    X1[letter+'Count'] = df1.Name.str.count(letter)

In [101]:
X1.head()

Unnamed: 0,FL_b,FL_c,FL_d,FL_e,FL_f,FL_g,FL_h,FL_i,FL_j,FL_k,...,qCount,rCount,sCount,tCount,uCount,vCount,wCount,xCount,yCount,zCount
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0
2,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [104]:
X_train,X_test,y_train,y_test = train_test_split(
    X1,y,test_size=0.33,random_state=None)

In [105]:
clf.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [106]:
clf.score(X_test,y_test)

0.7887109077040427

### Using last two and last one letter(s) 

In [107]:
df1['Last_two_Letter'] = df1['Name'].apply(lambda x:x[-2:])

In [108]:
df1.head()

Unnamed: 0,Name,Gender,Last_Letter,First_Letter,Last_two_Letter
0,Shirley,female,y,s,ey
1,Judye,female,e,j,ye
2,Eric,male,c,e,ic
3,Latia,female,a,l,ia
4,Imogene,female,e,i,ne


In [109]:
x3 = pd.get_dummies(df1.Last_two_Letter,drop_first=True,prefix='LTL')

In [116]:
X1 = pd.concat([x1,x3],axis=1)

In [119]:
X1.head()

Unnamed: 0,FL_b,FL_c,FL_d,FL_e,FL_f,FL_g,FL_h,FL_i,FL_j,FL_k,...,LTL_yl,LTL_ym,LTL_yn,LTL_ys,LTL_yt,LTL_za,LTL_ze,LTL_zi,LTL_zo,LTL_zy
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [120]:
X_train,X_test,y_train,y_test = train_test_split(
    X1,y,test_size=0.33,random_state=None)

In [121]:
clf.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [122]:
clf.score(X_test,y_test)

0.7944317315026698

## Document Classification

In [28]:
from nltk.corpus import movie_reviews

In [128]:
#create a document for all movies revies, both negative and positive
documents = [(' '.join(movie_reviews.words(fileid)),category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]
#reshuffle the documents so that the each rating is spread out from similar ones
random.shuffle(documents)

In [130]:
#break the list of review-rating pair into individual entities
review = [rev for rev,rating in documents]
rating = [rating for rev,rating in documents]

In [131]:
#create a dataframe and assign each of the above to it's own column
reviews_df = pd.DataFrame()
reviews_df['Review'] = review
reviews_df['Rating'] = rating

In [132]:
#let us view our dataframe
reviews_df.head()

Unnamed: 0,Review,Rating
0,"an attempt at florida film noir , palmetto fai...",neg
1,""" i ' ve been told by several people , ' you '...",pos
2,a hotshot lawyer gets an obviously guilty chil...,neg
3,if you had a chance to create a genetically pe...,pos
4,"so , it ' s thirty years later , and oscar and...",neg


In [134]:
import string

In [139]:
#let us remove all the punctuations
reviews_df['Review_Clean'] = reviews_df.Review.apply(
    lambda x:' '.join(t for t in x.split() if t not in string.punctuation))

In [140]:
reviews_df.head()

Unnamed: 0,Review,Rating,Review_Clean
0,"an attempt at florida film noir , palmetto fai...",neg,an attempt at florida film noir palmetto fails...
1,""" i ' ve been told by several people , ' you '...",pos,i ve been told by several people you re old fa...
2,a hotshot lawyer gets an obviously guilty chil...,neg,a hotshot lawyer gets an obviously guilty chil...
3,if you had a chance to create a genetically pe...,pos,if you had a chance to create a genetically pe...
4,"so , it ' s thirty years later , and oscar and...",neg,so it s thirty years later and oscar and felix...


In [133]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [180]:
vector = TfidfVectorizer(stop_words='english')

In [181]:
X = reviews_df.Review_Clean
y = reviews_df.Rating

In [182]:
X_train,X_test,y_train,y_test = train_test_split(
    X,y,test_size=0.33,random_state=42)

In [183]:
X_train_bow = vector.fit_transform(X_train).toarray()
X_test_bow = vector.transform(X_test).toarray()

In [184]:
from sklearn.ensemble import RandomForestClassifier

In [185]:
clf = MultinomialNB()

In [186]:
clf.fit(X_train_bow,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [187]:
clf.score(X_test_bow,y_test)

0.8151515151515152