# Import Required Libraries
Import the necessary libraries, including pandas and numpy.

In [1]:
# Import the necessary libraries
import pandas as pd
import numpy as np

# Load the CSV File
Load the raw CSV file into a pandas DataFrame.

In [2]:
# Load the CSV File

# Read the scraped job postings (stored one directory up) into a DataFrame
df = pd.read_csv(filepath_or_buffer='../scraped_jobs.csv')

# Preview the top five rows as a quick sanity check on the load
df.head(n=5)

Unnamed: 0,Position,Company,Location,Link,Uploaded On,Opportunity Type,Status,Applied,Application Deadline,Impressions,Eligibility
0,Merchandise / Retail Planner,Fashor Lifestyle Pvt. Ltd.,,https://unstop.com/jobs/merchandise-retail-pla...,2024-10-09T00:00:00+05:30,,open,0,62.0,43,Experienced Professionals
1,Customer Success Executive,Planet nextgen technologies,,https://unstop.com/jobs/customer-success-execu...,2024-10-09T00:00:00+05:30,,open,0,62.0,50,"Fresher, Undergraduate"
2,Direct Sales Executive,NTH Iconic Party Solutions Private Limited,,https://unstop.com/jobs/direct-sales-executive...,2024-10-09T00:00:00+05:30,,open,0,62.0,717,Experienced Professionals
3,Customer Service Representative,Career Comfort Solutions,,https://unstop.com/jobs/customer-service-repre...,2024-10-08T00:00:00+05:30,,open,50,62.0,5923,"Fresher, Engineering Students, MBA Students, P..."
4,Operation Manager,Defitex Innovative Solutions Private Limited,,https://unstop.com/jobs/operation-manager-defi...,2024-10-08T00:00:00+05:30,,open,2,62.0,5335,Experienced Professionals


# Inspect the Data
Inspect the first few rows of the DataFrame and check for any obvious issues.

In [3]:
# Inspect the Data

# A notebook cell renders only its final bare expression, so the intermediate
# results below are passed to display() explicitly; otherwise they would be
# computed and silently discarded.

# Preview the first few rows
display(df.head())

# Column dtypes and non-null counts (info() prints its report directly)
df.info()

# Summary statistics for every column, numeric and non-numeric alike
display(df.describe(include='all'))

# Count of missing values in each column (rendered as the cell's output)
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30072 entries, 0 to 30071
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Position              30072 non-null  object 
 1   Company               30072 non-null  object 
 2   Location              22691 non-null  object 
 3   Link                  30072 non-null  object 
 4   Uploaded On           30072 non-null  object 
 5   Opportunity Type      22914 non-null  object 
 6   Status                30072 non-null  object 
 7   Applied               30072 non-null  int64  
 8   Application Deadline  17079 non-null  float64
 9   Impressions           30072 non-null  int64  
 10  Eligibility           30071 non-null  object 
dtypes: float64(1), int64(2), object(8)
memory usage: 2.5+ MB


Position                    0
Company                     0
Location                 7381
Link                        0
Uploaded On                 0
Opportunity Type         7158
Status                      0
Applied                     0
Application Deadline    12993
Impressions                 0
Eligibility                 1
dtype: int64

# Handle Missing Values
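Identify the columns with missing values, fill missing 'Location' entries with 'Unknown', and drop any rows without a 'Link'. Missing values in the other columns are left as they are.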

In [4]:
# Handle Missing Values

# Identify the columns that contain at least one missing value.
# display() is needed because a bare expression in the middle of a cell
# is not rendered.
missing_values = df.isnull().sum()
missing_values = missing_values[missing_values > 0]
display(missing_values)

# Fill missing values in 'Location' with the placeholder 'Unknown'.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df['Location'] operates on an intermediate object, which triggers a
# chained-assignment FutureWarning in pandas 2.x and leaves df unchanged
# under Copy-on-Write.
df['Location'] = df['Location'].fillna('Unknown')

# Drop rows where 'Link' is missing since it's a critical field
df = df.dropna(subset=['Link'])

# Show the remaining missing-value counts per column. Only 'Location' and
# 'Link' are handled in this cell, so other columns may still contain NaNs.
df.isnull().sum()

Position                    0
Company                     0
Location                    0
Link                        0
Uploaded On                 0
Opportunity Type         7158
Status                      0
Applied                     0
Application Deadline    12993
Impressions                 0
Eligibility                 1
dtype: int64

# Convert Date Columns to Datetime
Convert the 'Uploaded On' column to datetime format.

In [5]:
# Convert Date Columns to Datetime

# Parse the 'Uploaded On' values into pandas datetimes and store them back
df['Uploaded On'] = df['Uploaded On'].pipe(pd.to_datetime)

# Print the column dtypes to confirm the conversion took effect
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30072 entries, 0 to 30071
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype                                
---  ------                --------------  -----                                
 0   Position              30072 non-null  object                               
 1   Company               30072 non-null  object                               
 2   Location              30072 non-null  object                               
 3   Link                  30072 non-null  object                               
 4   Uploaded On           30072 non-null  datetime64[ns, pytz.FixedOffset(330)]
 5   Opportunity Type      22914 non-null  object                               
 6   Status                30072 non-null  object                               
 7   Applied               30072 non-null  int64                                
 8   Application Deadline  17079 non-null  float64                              


# Filter Data Based on Criteria
Filter the DataFrame to keep only the rows whose 'Status' is 'open'.

In [6]:
# Filter Data Based on Criteria

# Keep only the postings whose 'Status' equals 'open'
filtered_df = df.loc[df['Status'].eq('open')]

# Preview the top five rows of the filtered result
filtered_df.head(n=5)

Unnamed: 0,Position,Company,Location,Link,Uploaded On,Opportunity Type,Status,Applied,Application Deadline,Impressions,Eligibility
0,Merchandise / Retail Planner,Fashor Lifestyle Pvt. Ltd.,Unknown,https://unstop.com/jobs/merchandise-retail-pla...,2024-10-09 00:00:00+05:30,,open,0,62.0,43,Experienced Professionals
1,Customer Success Executive,Planet nextgen technologies,Unknown,https://unstop.com/jobs/customer-success-execu...,2024-10-09 00:00:00+05:30,,open,0,62.0,50,"Fresher, Undergraduate"
2,Direct Sales Executive,NTH Iconic Party Solutions Private Limited,Unknown,https://unstop.com/jobs/direct-sales-executive...,2024-10-09 00:00:00+05:30,,open,0,62.0,717,Experienced Professionals
3,Customer Service Representative,Career Comfort Solutions,Unknown,https://unstop.com/jobs/customer-service-repre...,2024-10-08 00:00:00+05:30,,open,50,62.0,5923,"Fresher, Engineering Students, MBA Students, P..."
4,Operation Manager,Defitex Innovative Solutions Private Limited,Unknown,https://unstop.com/jobs/operation-manager-defi...,2024-10-08 00:00:00+05:30,,open,2,62.0,5335,Experienced Professionals


# Save the Cleaned Data
Save the cleaned, filtered DataFrame (open postings only) to a new CSV file, `cleaned_jobs.csv`.

In [7]:
# Save the Cleaned Data

# Write the filtered rows to 'cleaned_jobs.csv', leaving out the index column
filtered_df.to_csv(path_or_buf='cleaned_jobs.csv', index=False)