# Import Required Libraries
Import necessary libraries such as pandas, numpy, and matplotlib.

In [12]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the Data
Load the CSV file into a pandas DataFrame.

In [13]:
# Load the Data
df = pd.read_csv('../scraped_internships.csv')

# Display the first few rows of the DataFrame to verify loading
df.head()

Unnamed: 0,Position,Company,Location,Link,Uploaded On,Opportunity Type,Status,Applied,Application Deadline,Impressions,Eligibility
0,Business Development and Sales Turnaround Inte...,Revv Self Drive Car Rental,,https://unstop.com/internships/business-develo...,2024-10-08T00:00:00+05:30,,open,1,28.0,289,"Fresher, MBA Students, Undergraduate, Postgrad..."
1,HR Recruiter Internship,earlyjobs,,https://unstop.com/internships/hr-recruiter-in...,2024-10-09T00:00:00+05:30,,open,2,62.0,512,"Undergraduate, Fresher"
2,HR Operations Internship,DesiDap,,https://unstop.com/internships/hr-operations-i...,2024-10-09T00:00:00+05:30,,open,2,62.0,1551,"Undergraduate, Fresher"
3,Social Media Marketing Internship,DesiDap,,https://unstop.com/internships/social-media-ma...,2024-10-09T00:00:00+05:30,,open,13,62.0,1838,"Fresher, Engineering Students, MBA Students, U..."
4,Sales and Marketing Internship,Underlinen Fashion House,,https://unstop.com/internships/sales-and-marke...,2024-10-09T00:00:00+05:30,,open,1,29.0,1728,Fresher


# Explore the Data
Perform initial exploration of the data, including checking the shape, data types, and summary statistics.

In [14]:
# Explore the Data

# Check the shape of the DataFrame
df.shape

# Check the data types of each column
df.dtypes

# Get summary statistics of the DataFrame
df.describe(include='all')

# Check for missing values in the DataFrame
df.isnull().sum()

# Display the column names
df.columns

# Display the first few rows of the DataFrame again for reference
df.head()

Unnamed: 0,Position,Company,Location,Link,Uploaded On,Opportunity Type,Status,Applied,Application Deadline,Impressions,Eligibility
0,Business Development and Sales Turnaround Inte...,Revv Self Drive Car Rental,,https://unstop.com/internships/business-develo...,2024-10-08T00:00:00+05:30,,open,1,28.0,289,"Fresher, MBA Students, Undergraduate, Postgrad..."
1,HR Recruiter Internship,earlyjobs,,https://unstop.com/internships/hr-recruiter-in...,2024-10-09T00:00:00+05:30,,open,2,62.0,512,"Undergraduate, Fresher"
2,HR Operations Internship,DesiDap,,https://unstop.com/internships/hr-operations-i...,2024-10-09T00:00:00+05:30,,open,2,62.0,1551,"Undergraduate, Fresher"
3,Social Media Marketing Internship,DesiDap,,https://unstop.com/internships/social-media-ma...,2024-10-09T00:00:00+05:30,,open,13,62.0,1838,"Fresher, Engineering Students, MBA Students, U..."
4,Sales and Marketing Internship,Underlinen Fashion House,,https://unstop.com/internships/sales-and-marke...,2024-10-09T00:00:00+05:30,,open,1,29.0,1728,Fresher


# Handle Missing Values
Identify and handle missing values in the dataset.

In [15]:
# Handle Missing Values

# Identify columns with missing values
missing_values = df.isnull().sum()
missing_values = missing_values[missing_values > 0]
missing_values

# Handle missing values by filling with appropriate values or dropping
# For this example, we'll fill missing values with a placeholder or drop them based on the context

# Fill missing values in 'Location' with 'Unknown'
df['Location'].fillna('Unknown', inplace=True)

# Drop rows where 'Link' is missing as it's crucial for the dataset
df.dropna(subset=['Link'], inplace=True)

# For 'Application Deadline', fill missing values with a placeholder date
df['Application Deadline'].fillna('No Deadline', inplace=True)

# Verify that there are no more missing values
df.isnull().sum()

Position                    0
Company                     0
Location                    0
Link                        0
Uploaded On                 0
Opportunity Type        14231
Status                      0
Applied                     0
Application Deadline        0
Impressions                 0
Eligibility                 0
dtype: int64

# Convert Data Types
Convert columns to appropriate data types, such as dates and categorical variables.

In [16]:
# Convert Data Types

# Convert 'Uploaded On' and 'Application Deadline' to datetime, handling errors for non-date values
df['Uploaded On'] = pd.to_datetime(df['Uploaded On'], errors='coerce')
df['Application Deadline'] = pd.to_datetime(df['Application Deadline'], errors='coerce')

# Convert 'Opportunity Type' and 'Status' to categorical data type
df['Opportunity Type'] = df['Opportunity Type'].astype('category')
df['Status'] = df['Status'].astype('category')

# Verify the data types after conversion
df.dtypes

Position                                               object
Company                                                object
Location                                               object
Link                                                   object
Uploaded On             datetime64[ns, pytz.FixedOffset(330)]
Opportunity Type                                     category
Status                                               category
Applied                                                 int64
Application Deadline                           datetime64[ns]
Impressions                                             int64
Eligibility                                            object
dtype: object

# Feature Engineering
Create new features from existing data to enhance the dataset.

In [17]:
# Feature Engineering

# Convert 'Uploaded On' and 'Application Deadline' columns to timezone-naive datetime objects
df['Uploaded On'] = pd.to_datetime(df['Uploaded On']).dt.tz_localize(None)
df['Application Deadline'] = pd.to_datetime(df['Application Deadline']).dt.tz_localize(None)

# Create a new feature 'Days Until Deadline' which calculates the number of days from 'Uploaded On' to 'Application Deadline'
df['Days Until Deadline'] = (df['Application Deadline'] - df['Uploaded On']).dt.days

# Create a new feature 'Is Remote' based on the 'Location' column
df['Is Remote'] = df['Location'].apply(lambda x: 'Remote' in x)

# Create a new feature 'Total Impressions' by summing up the 'Impressions' column
df['Total Impressions'] = df['Impressions'].sum()

# Create a new feature 'Eligibility Count' which counts the number of eligibility criteria
df['Eligibility Count'] = df['Eligibility'].apply(lambda x: len(x.split(',')))

# Display the first few rows of the DataFrame to verify the new features
df.head()

Unnamed: 0,Position,Company,Location,Link,Uploaded On,Opportunity Type,Status,Applied,Application Deadline,Impressions,Eligibility,Days Until Deadline,Is Remote,Total Impressions,Eligibility Count
0,Business Development and Sales Turnaround Inte...,Revv Self Drive Car Rental,Unknown,https://unstop.com/internships/business-develo...,2024-10-08,,open,1,1970-01-01 00:00:00.000000028,289,"Fresher, MBA Students, Undergraduate, Postgrad...",-20004.0,False,203886992,4
1,HR Recruiter Internship,earlyjobs,Unknown,https://unstop.com/internships/hr-recruiter-in...,2024-10-09,,open,2,1970-01-01 00:00:00.000000062,512,"Undergraduate, Fresher",-20005.0,False,203886992,2
2,HR Operations Internship,DesiDap,Unknown,https://unstop.com/internships/hr-operations-i...,2024-10-09,,open,2,1970-01-01 00:00:00.000000062,1551,"Undergraduate, Fresher",-20005.0,False,203886992,2
3,Social Media Marketing Internship,DesiDap,Unknown,https://unstop.com/internships/social-media-ma...,2024-10-09,,open,13,1970-01-01 00:00:00.000000062,1838,"Fresher, Engineering Students, MBA Students, U...",-20005.0,False,203886992,6
4,Sales and Marketing Internship,Underlinen Fashion House,Unknown,https://unstop.com/internships/sales-and-marke...,2024-10-09,,open,1,1970-01-01 00:00:00.000000029,1728,Fresher,-20005.0,False,203886992,1


# Save the Cleaned Data
Save the cleaned and preprocessed data to a new CSV file.

In [19]:
# Save the Cleaned Data

# Save the cleaned and preprocessed DataFrame to a new CSV file
df.to_csv('cleaned_internship.csv', index=False)