# Working with Resume Text and Naive Bayes in scikit-learn

## Agenda
### Working with resume data
<ul>
    <li>read data from csv file</li>
    <li>lemmatize and transform data</li>
    <li>split and vectorize data</li>
</ul>

### Naive Bayes classification
<ul>
    <li>Building a Naive Bayes model</li>
    <li>Comparing Naive Bayes with logistic regression</li>
</ul>



In [None]:
# Load the resume data from result.csv into the DataFrame `result`,
# which the later cells clean, label and split.
import pandas as pd
result = pd.read_csv('result.csv')

# Display 5 randomly chosen rows to check the format of the loaded data
result.sample(5)

# Lemmatize and transform the data

In [None]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer

# Text-normalisation tools: lemmatizer, stemmer and the English stopword set.
# NOTE(review): `stemmer` is not used in this cell; it is kept in case a later
# cell relies on it.
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer('english')
words = set(stopwords.words('english'))


def _clean_description(text):
    """Lower-case *text*, drop English stopwords and lemmatize what remains.

    Returns the surviving tokens joined by single spaces.
    """
    # Lower-case BEFORE filtering: the stopword set is lower-case, so filtering
    # the raw tokens would let capitalised stopwords ("The", "And", ...) through.
    tokens = text.lower().split()
    return " ".join(lemmatizer.lemmatize(tok) for tok in tokens if tok not in words)


# Build the cleaned text column used as model input
result['clean'] = result['description'].apply(_clean_description)

# Remove ambiguous rows, i.e. titles that mention both 'Data Scientist' and
# 'Data Analyst'. `title_c` is used here only as a temporary flag column.
result['title_c'] = result['title'].map(lambda x: 1 if 'Data Scientist' in x and 'Data Analyst' in x else 0)
result.drop(result[result.title_c == 1].index, inplace=True)

# Target label: 1 if the title contains 'Data Scientist', otherwise 0
# (the remaining rows are treated as 'Data Analyst').
result['title_c'] = result['title'].map(lambda x: 1 if 'Data Scientist' in x else 0)

# Display 5 randomly chosen rows to check the new columns
result.sample(5)

# Split and vectorize the data

In [None]:
# Load the preprocessing and model classes from scikit-learn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer # feature_extraction
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB # naive_bayes
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression # logisticRegression
from sklearn.svm import SVC # SVM
from sklearn.pipeline import Pipeline

# Pipeline: binary bag-of-words features (word present / absent) feeding a
# Bernoulli Naive Bayes classifier.
text_clf = Pipeline([('vect', CountVectorizer(binary=True)),
                     ('clf', BernoulliNB())
                    ])

# Hold out 20% of the rows as the test set. The seed is fixed so the split,
# and therefore every metric computed from it, is the same on every run;
# otherwise results cannot be compared between models or between runs.
X_train, X_test, y_train, y_test = train_test_split(
    result['clean'], result.title_c, test_size=0.2, random_state=42)
text_clf.fit(X_train, y_train)

# Run Naive Bayes and review the results

In [None]:
from sklearn import metrics

# Label the held-out test texts with the fitted pipeline
y_pred_class = text_clf.predict(X_test)

# Print the classification report comparing the true test labels
# with the predictions
print(metrics.classification_report(y_true=y_test, y_pred=y_pred_class))

In [None]:
# Show the confusion matrix of the true test labels (y_test) against the
# predicted labels (y_pred_class). Labels follow title_c: 1 for titles
# containing 'Data Scientist', 0 otherwise.
metrics.confusion_matrix(y_test, y_pred_class)