# INTRODUCTION<br><br>
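**In this kernel, we walk through a basic Natural Language Processing (NLP) workflow: cleaning Twitter user descriptions, building a bag-of-words representation, and classifying gender with Naive Bayes and Random Forest.**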

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# The input data files live in the "../input/" directory.
# Listing that directory shows which files are available to load.

import os

input_files = os.listdir("../input")
print(input_files)

# Anything written to the current directory is saved as notebook output.

### Import Data 

In [None]:
# Load the Twitter user table; the CSV is decoded as Latin-1.
csv_path = "../input/gender-classifier-DFE-791531.csv"
data = pd.read_csv(csv_path, encoding="latin1")

In [None]:
# Keep only the two columns used in this notebook (the gender label and the
# free-text description) and discard every row where either one is missing.
# dropna() returns a new frame, so nothing is mutated in place.
data = data[["gender", "description"]].dropna(axis=0, how="any")

In [None]:
# Preview the first five rows of the gender/description frame
data.head(n=5)

In [None]:
# Size of the dataframe as a (rows, columns) tuple
n_rows, n_columns = data.shape
(n_rows, n_columns)

In [None]:
# Encode the target variable: 1 = female, 0 = male.
#
# The previous comprehension (`1 if each == "female" else 0`) mapped EVERY value
# other than "female" to 0, so any additional category in the column would be
# silently counted as male.
# NOTE(review): this dataset is believed to also contain labels such as "brand"
# and "unknown" -- confirm with data.gender.value_counts().
# Only rows explicitly labelled "female" or "male" are kept before encoding.
#
# Run this cell once per data load: after encoding, the column no longer holds
# the string labels, so a second run would filter out every row.
gender_codes = {"female": 1, "male": 0}
is_labelled = data.gender.isin(list(gender_codes))
data = data.loc[is_labelled].copy()
data["gender"] = data["gender"].map(gender_codes)

# Inspect the encoded data (first 10 rows)
data.head(10)


In [None]:
# Pick a single description (the row whose index label is 4) to prototype the
# text cleaning on one example before applying it to every description.

data["description"].loc[4]

### Regular Expression

In [None]:
# The pattern "[^a-zA-Z]" matches any single character that is not an ASCII letter.
import re

first_description = data.description[4]

# Replace every non-letter character with a space, leaving only letters and spaces.
letters_only = re.sub("[^a-zA-Z]", " ", first_description)

# Lower-case the text so that e.g. "Year" and "year" count as the same word.
description = letters_only.lower()
description

### Stopwords

In [None]:
# Remove stopwords: very common words ("and", "the", ...) that carry little
# information for classification.
import nltk
from nltk.corpus import stopwords

# Tokenize with word_tokenize() rather than str.split(). split() only breaks on
# whitespace, so "shouldn't" stays a single token, whereas word_tokenize()
# separates it into the two tokens "should" and "n't".
description = nltk.word_tokenize(description)

# Build the stopword set once. The original rebuilt it inside the comprehension,
# i.e. re-read the stopword list and constructed a new set for every token.
english_stopwords = set(stopwords.words("english"))

# Keep only the tokens that are not stopwords, then display the filtered tokens.
description = [word for word in description if word not in english_stopwords]
description

### Lemmatization

In [None]:
# Lemmatization reduces each token to its dictionary base form (lemma).
# NOTE(review): lemmatize() is called without a part-of-speech tag, so verb
# forms such as "loved" may be left unchanged -- confirm against the NLTK docs.

# nltk is imported under the alias `nlp` for the lemmatizer
import nltk as nlp

lemma = nlp.WordNetLemmatizer()

# Lemmatize every token, keeping the result as a list, then display it.
description = list(map(lemma.lemmatize, description))
description

In [None]:
# Glue the cleaned tokens back together into one space-separated string
# and display the cleaned text.

separator = " "
description = separator.join(description)
description

### Apply to All Descriptions

In [None]:
# Apply the cleaning steps prototyped above to every description.

# Create the lemmatizer once; the original loop re-instantiated it for every row.
lemma = nlp.WordNetLemmatizer()


def clean_description(text):
    """Return one description as a cleaned, space-separated string of tokens.

    Steps: replace every non-letter character with a space, lower-case the
    text, tokenize it with nltk.word_tokenize, lemmatize each token, and join
    the tokens back together with single spaces.

    Stopwords are deliberately not removed here; the CountVectorizer in the
    bag-of-words step is created with stop_words="english".

    Parameters:
        text: the raw description string.

    Returns:
        The cleaned description as a single string.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", text).lower()
    tokens = nltk.word_tokenize(letters_only)
    return " ".join(lemma.lemmatize(token) for token in tokens)


# One cleaned string per row, in the same order as data.description.
# A comprehension with its own loop variable no longer overwrites the
# single-example `description` built in the earlier cells.
description_list = [clean_description(text) for text in data.description]

### Bag of Words

In [None]:
# Bag of words: turn each cleaned description into a vector of word counts.

from sklearn.feature_extraction.text import CountVectorizer

# max_features caps the vocabulary size; stop_words="english" makes the
# vectorizer ignore scikit-learn's built-in English stopword list.
max_features = 1000
count_vectorizer = CountVectorizer(max_features=max_features, stop_words="english")

# Document-term count matrix, one row per description. fit_transform() returns
# a sparse matrix; .toarray() converts it to a dense NumPy array.
# (The variable name keeps its original spelling because later cells use it.)
sparce_matrix = count_vectorizer.fit_transform(description_list).toarray()  # x

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older versions.
# .tolist() turns the returned array into a plain list of str so the printed
# output keeps the same list format as before.
if hasattr(count_vectorizer, "get_feature_names_out"):
    feature_names = count_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = count_vectorizer.get_feature_names()

# Print the words that were kept in the vocabulary
print("{} most common words: {}".format(max_features, feature_names))

### Train and Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Features: the bag-of-words count matrix.
x = sparce_matrix
# Labels: the first column of `data` (the encoded gender classes).
y = data.iloc[:, 0].values

# Hold out 10% of the rows for testing (test_size=0.1, i.e. a 90-10 split);
# the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, random_state=42
)

### Apply Naive Bayes Machine Learning Algorithm


In [None]:
# Gaussian Naive Bayes classifier

from sklearn.naive_bayes import GaussianNB

# fit() returns the estimator itself, so construction and training are chained.
nb = GaussianNB().fit(x_train, y_train)

# Mean accuracy on the held-out test set
nb_accuracy = nb.score(x_test, y_test)
print("accuracy: ", nb_accuracy)


In [None]:
# Plot the confusion matrix for Naive Bayes

from sklearn.metrics import confusion_matrix

y_pred = nb.predict(x_test)
y_true = y_test

# confusion_matrix(y_true, y_pred): rows are true classes, columns are predicted.
cm_nb = confusion_matrix(y_true, y_pred)

# Label the axes and add a title so the figure can be read on its own; without
# them the true and predicted axes cannot be told apart.
fig, ax = plt.subplots()
sns.heatmap(cm_nb, annot=True, cmap="RdPu", fmt=".0f", cbar=False, ax=ax)
ax.set(
    xlabel="Predicted label",
    ylabel="True label",
    title="Naive Bayes confusion matrix",
)
plt.show()

### Apply Random Forest Machine Learning Algorithm

In [None]:
# Random Forest classifier

from sklearn.ensemble import RandomForestClassifier

# A random forest is trained with randomness (bootstrap samples and feature
# subsets). Without a fixed random_state the reported accuracy changes from run
# to run; seed it with the same value used for the train/test split so the
# notebook is reproducible.
rf = RandomForestClassifier(n_estimators=100, random_state=42)

rf.fit(x_train, y_train)

# Mean accuracy on the held-out test set
print("accuracy: ", rf.score(x_test, y_test))

In [None]:
# Plot the confusion matrix for Random Forest

from sklearn.metrics import confusion_matrix

y_pred = rf.predict(x_test)
y_true = y_test

# confusion_matrix(y_true, y_pred): rows are true classes, columns are predicted.
cm_rf = confusion_matrix(y_true, y_pred)

# Label the axes and add a title so the figure can be read on its own; without
# them the true and predicted axes cannot be told apart.
fig, ax = plt.subplots()
sns.heatmap(cm_rf, annot=True, cmap="RdPu", fmt=".0f", cbar=False, ax=ax)
ax.set(
    xlabel="Predicted label",
    ylabel="True label",
    title="Random Forest confusion matrix",
)
plt.show()