## Step 05 - Implementing StopWords on regional Technical Requirements ##

In this section we will go through the same process and implement StopWords on the regional Technical Requirements. 

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import stop_words
import seaborn as sns
cvec = CountVectorizer(stop_words = stop_words.ENGLISH_STOP_WORDS)

In [2]:
custom_stopwords = ['000', '01', '06', '08','10254', '12', '15',
                   '19', '2018', '22', '25', '28', '45', '500',
                   'cox', 'norfolk', 'apply', 'com', 'www', 'applications', 'application',
                   'applicants', 'southern', 'https', 'ia', 'var', 'indeedapply', 'env',
                   'atlanta', 'opportunity', 'iip', 'gender', 'location', 'new', 'employer',
                   'midtown', 'manheim', 'ml', 'including', 'llc', 'truck', 'automotive', 'nationality', 
                   'nation', 'iot', 'kelley', 'hopea', 'date', 'incadea', 'honeywell', '100', '1372', '27', '300',
                   '30308', '30309', '59', '60', '666', '715', '800', '850', '89', '90', 'ga', 'geo', 'genetic',
                    'mercedes', 'marta', 'lunch', 'familimarity', 'fitting', 'floors', 'furthermore', 'living', 
                    'make', 'members', 'family', 'req149533', 'requisition', 'freshman', 'sophomore', 'et', 'etc',
                    'etl', 'job', 'invest', 'member', 'eye', 'relocation', 'Unnamed', 'wework', 'yarn', 'yrs',
                    'test', 'intent', 'intermediete', 'key', 'inflection', 'informatica', 'way', 'recent', 'fewer',
                    'iteratively', 'joining', 'd3', 'bi', 'bs', 'alteryx', 'benz', 'ai', 'arcgis', 'talend', 'al',
                    'bus', 'cassandra', 'growing', 'growth', 'guidance', 'bigdata', 'bigquery', 'cotiviti', 
                    'councils', 'like', 'located', 'devops', 'usa', 'winning', 'ex', 'awesome', 'address', 
                    'assurance', 'pig', 'needed', 'id', 'integral', 'impeccable', 'arts', 'auditing', 'community',
                    'commuter', 'jobs', 'help', 'js', 'human', 'variety', 'stipend', 'rewards', 'sharting', 
                    'daimler', 'degreepreferred', 'advisors', 'characteristics', 'draw', 'donor', 'creek', 'dental',
                    'medical', 'survival', '0064382_p0223181', '10', '1553', '2016', '24', '30327', '401',
                    'experiencepredictive', 'emory', 'caffe2', 'caffe', 'workingmother',]

**Using min_df**

Again, note the use of min_df.

In [3]:
# new list = default list stop_words.ENGLISH_STOP_WORDS
# new list.append(whatever)

# cvec = CountVectorizer(stop_words = new list)

new_stop = list(stop_words.ENGLISH_STOP_WORDS)

new_stop.extend(custom_stopwords)


cvec = CountVectorizer(stop_words = new_stop, min_df=15)

In [4]:
bullets = pd.read_csv('Bullets')
bullets

Unnamed: 0,0
0,5-7 years of Data Modeling and Machine Learnin...
1,5-7 years of Python & R- Language Experience
2,5-7 years of HADOOP-BIGDATA experience
3,Proficiency dealing with extraction and manipu...
4,3 -5 years of experience in advanced statistic...
5,3-5 years of experience of learning systems to...
6,3-5 years of experience of analyzing large dat...
7,3-5 years of experience building large data se...
8,Experience developing custom data models to dr...
9,Experience working in a cross-functional envir...


In [5]:
bullets['0'][6]

'3-5 years of experience of analyzing large data sets to develop custom models and algorithms to drive business solutions.'

`cvec.fit()` will be expecting a pandas Series of text objects:

In [6]:
cvec.fit(bullets['0'])
new_corpus = cvec.transform(bullets['0'])
new_corpus

<447x24 sparse matrix of type '<class 'numpy.int64'>'
	with 693 stored elements in Compressed Sparse Row format>

In [7]:
new_corpus.todense()

matrix([[0, 0, 0, ..., 0, 0, 1],
        [0, 0, 0, ..., 0, 0, 1],
        [0, 0, 0, ..., 0, 0, 1],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [8]:
df  = pd.DataFrame(new_corpus.todense(),
                   columns=cvec.get_feature_names())

df.head()

df.sort_values('data', ascending=False).head(10)

Unnamed: 0,ability,advanced,algorithms,analytics,business,data,degree,engineering,experience,learning,...,salaries,science,scientist,skills,sql,statistical,statistics,techniques,tools,years
284,0,0,1,0,0,3,0,1,0,1,...,0,0,0,0,0,1,0,0,0,0
440,0,0,1,0,0,3,0,1,0,1,...,0,0,0,0,0,1,0,0,0,0
223,0,0,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
250,0,0,1,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28,0,0,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35,0,0,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
54,0,0,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190,0,0,0,0,2,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
218,0,0,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
381,0,0,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
df.head()

Unnamed: 0,ability,advanced,algorithms,analytics,business,data,degree,engineering,experience,learning,...,salaries,science,scientist,skills,sql,statistical,statistics,techniques,tools,years
0,0,0,0,0,0,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,1,0,0,0,1,0,0,1,0,...,0,0,0,0,0,2,0,1,0,1


**Exporting DataFrame to CSV**

In [10]:
df.to_csv('Technical_Word_Rank.csv')

In [11]:
df.sum().sort_values(ascending=False)

data           151
experience      70
scientist       38
learning        34
years           31
machine         29
models          28
science         28
business        27
ability         26
techniques      26
modeling        24
salaries        23
statistical     22
python          21
advanced        20
skills          19
engineering     19
sql             18
tools           17
degree          16
analytics       16
algorithms      16
statistics      15
dtype: int64

In [12]:
#clean it up by importing STOP Words

from sklearn.feature_extraction import stop_words
 
print(stop_words.ENGLISH_STOP_WORDS)

frozenset({'seemed', 'among', 'already', 'this', 'were', 'beyond', 'ie', 'any', 'amoungst', 'might', 'whom', 'now', 'their', 'twelve', 'afterwards', 'something', 'up', 'move', 'even', 'when', 'alone', 'fifty', 'front', 'own', 'find', 'the', 'there', 'be', 'nowhere', 'de', 'whatever', 'about', 'off', 'whenever', 'can', 'sometime', 'made', 'six', 'therein', 'whither', 'you', 'always', 'latter', 'behind', 'perhaps', 'ltd', 'hereupon', 'namely', 'sometimes', 'who', 'we', 'most', 'down', 'yours', 'within', 'become', 'see', 'under', 'through', 'whereas', 'along', 'somewhere', 'again', 'that', 'put', 'beside', 'none', 'serious', 'part', 'hasnt', 'an', 'from', 'i', 'hence', 'here', 'thence', 'get', 'enough', 'twenty', 'fill', 'towards', 'around', 'had', 'many', 'becomes', 'to', 'everyone', 'formerly', 'another', 'before', 'often', 'every', 'mostly', 'thick', 'wherever', 'show', 'across', 'neither', 'others', 'thru', 'may', 'whereafter', 'whose', 'are', 'yet', 'been', 'nevertheless', 'please', 

In [13]:
df.sum().sort_values(ascending=False)

data           151
experience      70
scientist       38
learning        34
years           31
machine         29
models          28
science         28
business        27
ability         26
techniques      26
modeling        24
salaries        23
statistical     22
python          21
advanced        20
skills          19
engineering     19
sql             18
tools           17
degree          16
analytics       16
algorithms      16
statistics      15
dtype: int64

In [14]:
#save cleaned up dataframe

df.to_csv('Word Rank')

In [15]:
df1 = pd.read_csv('Word Rank')

In [16]:
df1.sum().sort_values(ascending=False)

Unnamed: 0     99681
data             151
experience        70
scientist         38
learning          34
years             31
machine           29
science           28
models            28
business          27
techniques        26
ability           26
modeling          24
salaries          23
statistical       22
python            21
advanced          20
skills            19
engineering       19
sql               18
tools             17
degree            16
analytics         16
algorithms        16
statistics        15
dtype: int64

In [17]:
list(df1.columns.values)


['Unnamed: 0',
 'ability',
 'advanced',
 'algorithms',
 'analytics',
 'business',
 'data',
 'degree',
 'engineering',
 'experience',
 'learning',
 'machine',
 'modeling',
 'models',
 'python',
 'salaries',
 'science',
 'scientist',
 'skills',
 'sql',
 'statistical',
 'statistics',
 'techniques',
 'tools',
 'years']

In [18]:
df1.sum().sort_values(ascending=True)

statistics        15
algorithms        16
analytics         16
degree            16
tools             17
sql               18
engineering       19
skills            19
advanced          20
python            21
statistical       22
salaries          23
modeling          24
ability           26
techniques        26
business          27
models            28
science           28
machine           29
years             31
learning          34
scientist         38
experience        70
data             151
Unnamed: 0     99681
dtype: int64

In [19]:

df_x = df1.sum().sort_values(ascending=True)

**Exporting the cleaned DataFrame to a couple of other CSV's for future use**

In [20]:
df_x.to_csv('Count of Words-Technical.csv')

In [21]:
df_x.to_csv('Technical Text')

In [22]:
df1.sum().sort_values() > 5

statistics     True
algorithms     True
analytics      True
degree         True
tools          True
sql            True
engineering    True
skills         True
advanced       True
python         True
statistical    True
salaries       True
modeling       True
ability        True
techniques     True
business       True
models         True
science        True
machine        True
years          True
learning       True
scientist      True
experience     True
data           True
Unnamed: 0     True
dtype: bool

In [23]:
df1.sum().sort_values()[df1.sum().sort_values() > 15].index

Index(['algorithms', 'analytics', 'degree', 'tools', 'sql', 'engineering',
       'skills', 'advanced', 'python', 'statistical', 'salaries', 'modeling',
       'ability', 'techniques', 'business', 'models', 'science', 'machine',
       'years', 'learning', 'scientist', 'experience', 'data', 'Unnamed: 0'],
      dtype='object')