In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd


# Load the resume and job-description tables from their CSV files.
rdf = pd.read_csv('r.csv')
jddf = pd.read_csv('s.csv')

# 1. Check for missing values in the datasets
# Prints the per-column count of null cells, resumes first.
for heading, frame in (
    ("\nMissing values in resume dataset: ", rdf),
    ("\nMissing values in job descriptions dataset:", jddf),
):
    print(heading)
    print(frame.isnull().sum())

# 2. Statistics Summary
# Prints DataFrame.describe() for each table, in the same order as above.
for heading, frame in (
    ("\nResumes dataset summary:", rdf),
    ("\nJob Descriptions dataset summary:", jddf),
):
    print(heading)
    print(frame.describe())

# 3. Distribution of text length in resumes and job descriptions

# 'Combined' is the text column reused by the word-cloud section below:
# extracted skills for resumes, listed skills for job descriptions.
rdf['Combined'] = rdf['Extracted Skills']
jddf['Combined'] = jddf['Skills']

# Plot distribution of resume lengths.
# Resume skills are read from 'Extracted Skills' everywhere else in this
# script, so fall back to the 'Combined' copy of that column when the resume
# CSV has no 'Skills' column instead of failing with a KeyError.
resume_length_column = 'Skills' if 'Skills' in rdf.columns else 'Combined'
rl = rdf[resume_length_column].str.len()  # length of each string, in characters

plt.figure(figsize=(10, 6))
sns.histplot(rl, kde=True, color='blue', bins=20)
plt.title('Distribution of Resume Lengths')
# .str.len() measures characters, so the axis is labelled accordingly.
plt.xlabel('Number of Characters in Resume')
plt.ylabel('Frequency')
plt.show()

# Plot distribution of job description lengths
jdl = jddf['Job Description'].str.len()  # length of each string, in characters

plt.figure(figsize=(10, 6))
sns.histplot(jdl, kde=True, color='green', bins=20)
plt.title('Distribution of Job Description Lengths')
plt.xlabel('Number of Characters in Job Description')
plt.ylabel('Frequency')
plt.show()

# 4. Find most common words in resumes and job descriptions
from wordcloud import WordCloud

# Word cloud for resumes.
# Missing rows are dropped and values coerced to str before joining, because
# str.join raises TypeError on non-string items such as NaN.
rt = ' '.join(rdf['Combined'].dropna().astype(str))
wcr = WordCloud(width=800, height=400, background_color='white').generate(rt)

plt.figure(figsize=(10, 6))
plt.imshow(wcr, interpolation='bilinear')
plt.title('Most Common Words in Resumes')
plt.axis('off')
plt.show()

# Word cloud for job descriptions.
# Uses jddf, the job-description frame loaded at the top of the script; the
# previous name (job_descriptions_df) was never defined and raised NameError.
jdt = ' '.join(jddf['Combined'].dropna().astype(str))
wcj = WordCloud(width=800, height=400, background_color='white').generate(jdt)

plt.figure(figsize=(10, 6))
plt.imshow(wcj, interpolation='bilinear')
plt.title('Most Common Words in Job Descriptions')
plt.axis('off')
plt.show()

# 5. Distribution of job titles in the job descriptions dataset
# Count how many rows carry each job title, then draw one bar per title.
# NOTE(review): the column name 'Job Title ' has a trailing space; presumably
# this matches the CSV header exactly -- confirm against the data file.
jtc = jddf['Job Title '].value_counts()
titles, counts = jtc.index, jtc.values

plt.figure(figsize=(12, 8))
sns.barplot(x=titles, y=counts, palette='viridis')
plt.title('Distribution of Job Titles in Job Descriptions')
plt.xlabel('Job Title')
plt.ylabel('Frequency')
plt.xticks(rotation=90)  # vertical tick labels
plt.show()

# 6. Summary statistics of the number of skills per resume / job description
def _count_skills(value):
    """Return the number of non-empty comma-separated entries in *value*.

    Missing values (NaN/None) count as zero skills; previously they were
    stringified to 'nan' and counted as one skill. Blank entries produced by
    stray or trailing commas are ignored as well.
    """
    if pd.isna(value):
        return 0
    return sum(1 for skill in str(value).split(',') if skill.strip())


rsc = rdf['Extracted Skills'].apply(_count_skills).describe()
jdsc = jddf['Skills'].apply(_count_skills).describe()

print(f"\nResume skills count :\n{rsc}")
print(f"\nJob description skills count :\n{jdsc}")
