### Necessary Imports

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import string
from wordcloud import WordCloud
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Fetch the WordNet corpus, which the WordNetLemmatizer used later in this notebook relies on.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/neil/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Importing the dataset

In [3]:
# Read the resume dataset and attach an empty 'Cleaned_Resume' column
# that acts as a placeholder for the cleaned text produced later on.
df = pd.read_csv('Resume_Data.csv', encoding='utf-8').assign(Cleaned_Resume='')

FileNotFoundError: [Errno 2] No such file or directory: 'Resume_Data.csv'

### Exploratory Data Analysis

In [None]:
# Preview the first rows of the loaded dataset.
df.head()

In [None]:
# Report how many resumes carry each 'Category' label, under a heading line.
print("Resume Categories", df['Category'].value_counts(), sep="\n")

#### Visualizing the number of resumes per category

In [None]:
# Count plot of the 'Category' column; categories are placed on the y axis.
plt.figure(figsize=(10, 10))             # 10x10 inch figure
plt.xticks(rotation=90)                  # Rotate the x-axis tick labels by 90 degrees
sns.countplot(data=df, y='Category')     # One bar per category, counted from df

### Data Cleaning

In [None]:
def Clean_Resume(resumeText):
    """Strip noise from a raw resume string and return the cleaned text.

    Removed (each match is replaced by a single space):
      * web URLs,
      * the stand-alone tokens "RT" and "cc",
      * hashtags and "@..." handles / e-mail domains,
      * punctuation characters,
      * non-ASCII characters.

    Runs of whitespace are finally collapsed into one space. Leading and
    trailing whitespace is collapsed but not stripped.

    Args:
        resumeText: The raw resume text (a ``str``).

    Returns:
        The cleaned resume text.
    """
    removals = (
        r'http\S+\s*',          # Web URLs (plus trailing whitespace)
        # Word boundaries keep "cc"/"RT" inside real words intact
        # (e.g. "account", "success", "SMART").
        r'\b(?:RT|cc)\b',
        r'#\S+',                # Hashtags
        r'@\S+',                # Handles / the "@domain" part of e-mails
    )
    for weed in removals:
        resumeText = re.sub(weed, ' ', resumeText)

    # Punctuation: the character class is built from an escaped literal so
    # that ']', '^', '-' and '\' are treated as plain characters.
    punctuation = r"""!"#$%&'_=-+()[];:,./?^*@{}|\~"""
    resumeText = re.sub('[%s]' % re.escape(punctuation), ' ', resumeText)

    # Non-ASCII characters. The backslashes matter: without them the class
    # means "anything outside '0'..'x'", which also wipes 'y', 'z' and spaces.
    resumeText = re.sub(r'[^\x00-\x7f]', ' ', resumeText)

    # Collapse whitespace last, so gaps left by the removals above are merged.
    resumeText = re.sub(r'\s+', ' ', resumeText)

    return resumeText

In [None]:
# Clean every entry of the 'Resume' column and store the result in 'Cleaned_Resume'.
df['Cleaned_Resume'] = df['Resume'].apply(Clean_Resume)
df.head()

In [None]:
# Merge all cleaned resumes into one corpus string.
# A space separator keeps the last word of one resume from fusing with the
# first word of the next, and iterating the column directly avoids relying on
# the DataFrame index being 0..n-1.
corpus = ' '.join(df['Cleaned_Resume'])
corpus[450:1000]                                                        # Peek at a slice of the corpus

### Creating the Tokenizer and Tokenizing

In [None]:
# Split the corpus into runs of word characters. The pattern is a raw string
# so that "\w" reaches the regex engine instead of being parsed as an
# (invalid) string escape sequence.
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(corpus)                                     # Individual word tokens

# Lowercase every token so that later counting is case-insensitive.
words = [token.lower() for token in tokens]
print(len(words))

### Fetching English Stop Words

In [None]:
# Download NLTK's stop word corpus, then load its English word list.
nltk.download('stopwords')
# NOTE: this rebinds the name `stopwords` (imported from nltk.corpus at the top of the notebook) to the English stop words.
stopwords = nltk.corpus.stopwords.words('english')

### Removing Stop words

In [None]:
# Drop English stop words from the token list.
# The stop words are copied into a set once, so each membership test is a
# hash lookup instead of a scan of the whole stop word collection per token.
stopword_set = set(stopwords)
words_new = [
    word
    for word in words
    if word not in stopword_set
]

In [None]:
# Number of words left after stop word removal.
len(words_new)

### Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer

# Map every remaining word to its WordNet lemma, using the lemmatizer's
# default arguments.
wnl = WordNetLemmatizer()
lem_words = list(map(wnl.lemmatize, words_new))

In [None]:
# Compare each word with its lemma across the WHOLE corpus (no hard-coded
# length) and count how many were changed by lemmatization.
diff = sum(1 for lemma, word in zip(lem_words, words_new) if lemma != word)
same = len(words_new) - diff                                            # Every word is either changed or unchanged
print('Number of words Lemmatized=', diff)
print('Number of words not Lemmatized=', same)

In [None]:
# Build a frequency distribution over the lemmatized words and plot the 30 most frequent ones.
freq_dist = nltk.FreqDist(lem_words)
plt.subplots(figsize=(20,12))                                           # New 20x12 inch figure for the plot
freq_dist.plot(30)

In [None]:
# The 50 most frequent lemmatized words together with their counts.
mostcommon = freq_dist.most_common(50)
mostcommon

In [None]:
# Rebuild one space-separated string from the lemmas, leaving out tokens made up only of digits.
res = ' '.join(lemma for lemma in lem_words if not lemma.isdigit())

In [None]:
# Render a word cloud of the lemmatized resume text.
# The word limit lives in one constant so the plot title always matches the
# limit actually passed to WordCloud.
MAX_CLOUD_WORDS = 200

plt.subplots(figsize=(16, 10))
wordcloud = WordCloud(
    background_color='black',
    max_words=MAX_CLOUD_WORDS,
    width=1400,
    height=1200,
).generate(res)
plt.imshow(wordcloud)
plt.title('Resume Text WordCloud (%d Words)' % MAX_CLOUD_WORDS)
plt.axis('off')                                                         # Hide the axes around the image
plt.show()

In [None]:
# Display the DataFrame, now including the 'Cleaned_Resume' column.
df