# Work with the Resume Text and LinearSVC in scikit-learn 

## Agenda
### Working with resume data
<ul>
    <li>read data from csv file</li>
    <li>lemmatize and transform data</li>
    <li>split and vectorize data</li>
</ul>

### LinearSVC classification
<ul>
    <li>Building a LinearSVC model</li>
    <li>Comparing LinearSVC with logistic regression</li>
</ul>



In [None]:
# Read result.csv into a DataFrame
import pandas as pd
result = pd.read_csv(filepath_or_buffer='c:/python/result.csv')

# Show five randomly drawn rows as a quick check of the table layout
result.sample(n=5)

# Lemmatize and transform the data

In [None]:
import nltk
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
# Build the lemmatizer and the English stopword set once; both are reused
# for every row processed below.
lemmatizer = WordNetLemmatizer()
words = set(stopwords.words('english'))


def _clean_description(text):
    """Return *text* lowercased, with stopwords removed and tokens lemmatized.

    The text is lowercased BEFORE filtering: the NLTK stopword entries are
    lowercase, so filtering the original-case tokens would let capitalised
    stopwords such as "The" or "And" slip through.  Lowercasing first also
    hands the lemmatizer a consistent, lowercase token.

    Tokens are split on whitespace only; punctuation attached to a word is
    kept as part of that token.
    """
    tokens = text.lower().split()
    return " ".join(lemmatizer.lemmatize(tok) for tok in tokens if tok not in words)


# Store the cleaned text next to the raw description
result['clean'] = result['description'].apply(_clean_description)

# Titles naming both 'Data Scientist' and 'Data Analyst' are ambiguous:
# flag them with 1, then drop those rows in place.
def _names_both_roles(title):
    if 'Data Scientist' in title and 'Data Analyst' in title:
        return 1
    return 0


result['title_c'] = result['title'].map(_names_both_roles)
ambiguous_rows = result.loc[result['title_c'] == 1].index
result.drop(ambiguous_rows, inplace=True)

# Binary target: 1 when the title contains 'Data Scientist', otherwise 0
result['title_c'] = result['title'].map(lambda title: int('Data Scientist' in title))

# Show five randomly drawn rows to check the new columns
result.sample(n=5)

# Split and vectorize the data

In [None]:
# load preprocess and fitting module from sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer # feature_extraction
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB # navie_bayes
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression # logisticRegression
from sklearn.svm import SVC, LinearSVC # SVM
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Assemble the pipeline: TF-IDF over unigrams and bigrams, then a linear SVM.
# The step names 'vect' and 'clf' are looked up again in later cells.
tfidf_step = ('vect', TfidfVectorizer(stop_words='english', ngram_range=(1, 2)))
svm_step = ('clf', LinearSVC())
text_clf = Pipeline(steps=[tfidf_step, svm_step])

# Hold out 20% of the rows for testing (fixed seed), then fit on the rest
X_train, X_test, y_train, y_test = train_test_split(
    result['clean'],
    result['title_c'],
    test_size=0.2,
    random_state=1,
)
text_clf.fit(X_train, y_train)

# Run LinearSVC and review the results

In [None]:
from sklearn import metrics

# Label the held-out documents with the fitted pipeline
y_pred_class = text_clf.predict(X_test)

# Print the classification report for the test split
report = metrics.classification_report(y_true=y_test, y_pred=y_pred_class)
print(report)

In [None]:
# Confusion matrix of the true labels against the predicted labels
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_class)

In [None]:
# Collect the vocabulary learned by the fitted TF-IDF step.
# `get_feature_names()` was removed in scikit-learn 1.2, so prefer
# `get_feature_names_out()` and fall back only on releases that predate it.
vectorizer = text_clf.named_steps['vect']
if hasattr(vectorizer, 'get_feature_names_out'):
    X_train_tokens = vectorizer.get_feature_names_out()
else:
    X_train_tokens = vectorizer.get_feature_names()
len(X_train_tokens)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Maps feature name -> negated coefficient for the most negative features.
# Filled by plot_coefficients() and read later when the word cloud is built.
feature = {}


def plot_coefficients(classifier, feature_names, top_features=25):
    """Bar-plot the most negative and most positive coefficients.

    Parameters
    ----------
    classifier : fitted estimator exposing ``coef_``
        Its coefficients are flattened with ``ravel()`` before ranking.
    feature_names : sequence of str
        Name of the feature behind each coefficient, in the same order.
    top_features : int, default 25
        How many coefficients to take from each end of the ranking.

    Side effects
    ------------
    Shows a matplotlib figure, and stores the ``top_features`` most negative
    features in the module-level ``feature`` dict as ``name -> -coefficient``.
    """
    coef = classifier.coef_.ravel()

    # Rank once; the lowest indices are the most negative coefficients and
    # the highest are the most positive.
    order = np.argsort(coef)
    top_negative_coefficients = order[:top_features]
    top_positive_coefficients = order[-top_features:]
    top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])

    feature_names = np.array(feature_names)
    colors = ['red' if c < 0 else 'blue' for c in coef[top_coefficients]]

    # Bars and tick labels share the same x positions so that every label
    # sits under its own bar rather than under the neighbouring one.
    bar_positions = np.arange(2 * top_features)
    plt.figure(figsize=(15, 5))
    plt.bar(bar_positions, coef[top_coefficients], color=colors)
    plt.xticks(bar_positions, feature_names[top_coefficients], rotation=60, ha='right')
    plt.show()

    # Keep the negative side only, sign-flipped; honour `top_features`
    # instead of a fixed count of 25.
    for i in top_negative_coefficients:
        feature[feature_names[i]] = -coef[i]
        
# Pull the fitted vectorizer and classifier out of the pipeline
cv = text_clf.named_steps['vect']
svm = text_clf.named_steps['clf']

# `get_feature_names()` was removed in scikit-learn 1.2; use the replacement
# when it exists and fall back only on older releases.
if hasattr(cv, 'get_feature_names_out'):
    vocabulary = cv.get_feature_names_out()
else:
    vocabulary = cv.get_feature_names()
plot_coefficients(svm, vocabulary)

# Make a word cloud

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

%matplotlib inline
# Build an 800x500 word cloud from the weights stored in `feature`
wordcloud = WordCloud(width=800, height=500).generate_from_frequencies(feature)

# Draw the cloud on a 16x10 figure with the axes hidden
plt.figure(figsize=(16, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
