# Import Required Libraries
Import necessary libraries such as pandas, numpy, and matplotlib.

In [2]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Display settings for pandas
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Load the Dataset
Load the raw data file into a pandas DataFrame.

In [3]:
# Load the Dataset
df = pd.read_csv('../scraped_hackathons.csv')
df.head()

Unnamed: 0,Title,Organisations,Link,Uploaded On,Opportunity Type,Status,Applied,Application Deadline,Impressions,Eligibility,Category,Region
0,Bid By Bit,K.J Somaiya College of Engineering,https://unstop.com/hackathons/bid-by-bit-kj-so...,2024-10-08T00:00:00+05:30,,open,16,2.0,1994,"Strategy, Coding Challenge, Hackathon, College...","Strategy, Coding Challenge, Hackathon, College...",offline
1,CraftNCode (State level for UP and Maharshtra),International Institute of Information Technol...,https://unstop.com/hackathons/craftncode-state...,2024-10-08T00:00:00+05:30,,open,0,9.0,6575,"Coding Challenge, Hackathon, Engineering Stude...","Coding Challenge, Hackathon, Engineering Stude...",offline
2,Datamatics Hackathon,"Indian Institute of Technology (IIT), Bombay",https://unstop.com/hackathons/datamatics-hacka...,2024-09-04T00:00:00+05:30,,open,63,21.0,6668,"Engineering Students, MBA Students, Undergradu...","Engineering Students, MBA Students, Undergradu...",online
3,SARCathon 2024 - Hackathon,"Indian Institute of Technology (IIT), Bombay",https://unstop.com/hackathons/sarcathon-2024-h...,2024-10-08T00:00:00+05:30,,open,39,2.0,5839,"All, Coding Challenge, Hackathon","All, Coding Challenge, Hackathon",online
4,Hackathon at Engineer NITK 2024,National Institute of Technology Karnataka (NI...,https://unstop.com/hackathons/hackathon-at-eng...,2024-10-08T18:00:00+05:30,,open,64,5.0,7897,"Hackathon, Coding Challenge, College Festival,...","Hackathon, Coding Challenge, College Festival,...",offline


# Explore the Dataset
Perform initial exploration of the dataset, including checking the shape, data types, and summary statistics.

In [4]:
# Explore the Dataset

# Check the shape of the dataset
df.shape

# Check the data types of each column
df.dtypes

# Get summary statistics of the dataset
df.describe(include='all')

# Check for missing values in the dataset
df.isnull().sum()

# Display the first few rows of the dataset
df.head()

Unnamed: 0,Title,Organisations,Link,Uploaded On,Opportunity Type,Status,Applied,Application Deadline,Impressions,Eligibility,Category,Region
0,Bid By Bit,K.J Somaiya College of Engineering,https://unstop.com/hackathons/bid-by-bit-kj-so...,2024-10-08T00:00:00+05:30,,open,16,2.0,1994,"Strategy, Coding Challenge, Hackathon, College...","Strategy, Coding Challenge, Hackathon, College...",offline
1,CraftNCode (State level for UP and Maharshtra),International Institute of Information Technol...,https://unstop.com/hackathons/craftncode-state...,2024-10-08T00:00:00+05:30,,open,0,9.0,6575,"Coding Challenge, Hackathon, Engineering Stude...","Coding Challenge, Hackathon, Engineering Stude...",offline
2,Datamatics Hackathon,"Indian Institute of Technology (IIT), Bombay",https://unstop.com/hackathons/datamatics-hacka...,2024-09-04T00:00:00+05:30,,open,63,21.0,6668,"Engineering Students, MBA Students, Undergradu...","Engineering Students, MBA Students, Undergradu...",online
3,SARCathon 2024 - Hackathon,"Indian Institute of Technology (IIT), Bombay",https://unstop.com/hackathons/sarcathon-2024-h...,2024-10-08T00:00:00+05:30,,open,39,2.0,5839,"All, Coding Challenge, Hackathon","All, Coding Challenge, Hackathon",online
4,Hackathon at Engineer NITK 2024,National Institute of Technology Karnataka (NI...,https://unstop.com/hackathons/hackathon-at-eng...,2024-10-08T18:00:00+05:30,,open,64,5.0,7897,"Hackathon, Coding Challenge, College Festival,...","Hackathon, Coding Challenge, College Festival,...",offline


# Handle Missing Values
Identify and handle missing values in the dataset.

In [5]:
# Handle Missing Values

# Identify columns with missing values
missing_values = df.isnull().sum()
missing_values = missing_values[missing_values > 0]
missing_values

# Handle missing values by filling with appropriate values or dropping
# For this example, we'll fill missing values with a placeholder or drop them if necessary

# Fill missing values in 'Applied' column with 0 (assuming no applications if missing)
df['Applied'].fillna(0, inplace=True)

# Fill missing values in 'Application Deadline' with a placeholder date
df['Application Deadline'].fillna('2099-12-31', inplace=True)

# Drop rows where 'Link' is missing as it's crucial for identifying the hackathon
df.dropna(subset=['Link'], inplace=True)

# Verify that there are no more missing values
df.isnull().sum()

Title                     0
Organisations             2
Link                      0
Uploaded On               0
Opportunity Type        132
Status                    0
Applied                   0
Application Deadline      0
Impressions               0
Eligibility               0
Category                  0
Region                    0
dtype: int64

# Convert Data Types
Convert data types of columns where necessary, such as converting date columns to datetime objects.

In [6]:
# Convert Data Types

# Convert 'Uploaded On' and 'Application Deadline' columns to datetime objects
df['Uploaded On'] = pd.to_datetime(df['Uploaded On'])
df['Application Deadline'] = pd.to_datetime(df['Application Deadline'])

# Verify the data types after conversion
df.dtypes

Title                                                  object
Organisations                                          object
Link                                                   object
Uploaded On             datetime64[ns, pytz.FixedOffset(330)]
Opportunity Type                                       object
Status                                                 object
Applied                                                 int64
Application Deadline                           datetime64[ns]
Impressions                                             int64
Eligibility                                            object
Category                                               object
Region                                                 object
dtype: object

# Feature Engineering
Create new features or modify existing ones to improve the dataset for analysis.

In [7]:
# Feature Engineering

# Convert 'Uploaded On' and 'Application Deadline' columns to timezone-naive datetime objects
df['Uploaded On'] = pd.to_datetime(df['Uploaded On']).dt.tz_localize(None)
df['Application Deadline'] = pd.to_datetime(df['Application Deadline']).dt.tz_localize(None)

# Create a new feature 'Days Until Deadline' by calculating the difference between 'Application Deadline' and 'Uploaded On'
df['Days Until Deadline'] = (df['Application Deadline'] - df['Uploaded On']).dt.days

# Create a new feature 'Is Online' to indicate if the hackathon is online or offline
df['Is Online'] = df['Region'].apply(lambda x: 1 if x == 'online' else 0)

# Create a new feature 'Total Impressions' by summing up 'Impressions' and 'Applied'
df['Total Impressions'] = df['Impressions'] + df['Applied']

# Create a new feature 'Application Status' to indicate if the application is recent or not
df['Application Status'] = df['Status'].apply(lambda x: 1 if x == 'recent' else 0)

# Display the first few rows to verify the new features
df.head()

Unnamed: 0,Title,Organisations,Link,Uploaded On,Opportunity Type,Status,Applied,Application Deadline,Impressions,Eligibility,Category,Region,Days Until Deadline,Is Online,Total Impressions,Application Status
0,Bid By Bit,K.J Somaiya College of Engineering,https://unstop.com/hackathons/bid-by-bit-kj-so...,2024-10-08 00:00:00,,open,16,1970-01-01 00:00:00.000000002,1994,"Strategy, Coding Challenge, Hackathon, College...","Strategy, Coding Challenge, Hackathon, College...",offline,-20004,0,2010,0
1,CraftNCode (State level for UP and Maharshtra),International Institute of Information Technol...,https://unstop.com/hackathons/craftncode-state...,2024-10-08 00:00:00,,open,0,1970-01-01 00:00:00.000000009,6575,"Coding Challenge, Hackathon, Engineering Stude...","Coding Challenge, Hackathon, Engineering Stude...",offline,-20004,0,6575,0
2,Datamatics Hackathon,"Indian Institute of Technology (IIT), Bombay",https://unstop.com/hackathons/datamatics-hacka...,2024-09-04 00:00:00,,open,63,1970-01-01 00:00:00.000000021,6668,"Engineering Students, MBA Students, Undergradu...","Engineering Students, MBA Students, Undergradu...",online,-19970,1,6731,0
3,SARCathon 2024 - Hackathon,"Indian Institute of Technology (IIT), Bombay",https://unstop.com/hackathons/sarcathon-2024-h...,2024-10-08 00:00:00,,open,39,1970-01-01 00:00:00.000000002,5839,"All, Coding Challenge, Hackathon","All, Coding Challenge, Hackathon",online,-20004,1,5878,0
4,Hackathon at Engineer NITK 2024,National Institute of Technology Karnataka (NI...,https://unstop.com/hackathons/hackathon-at-eng...,2024-10-08 18:00:00,,open,64,1970-01-01 00:00:00.000000005,7897,"Hackathon, Coding Challenge, College Festival,...","Hackathon, Coding Challenge, College Festival,...",offline,-20005,0,7961,0


# Save the Cleaned Dataset
Save the cleaned and preprocessed dataset to a new CSV file.

In [8]:
# Save the Cleaned Dataset

# Save the cleaned and preprocessed dataset to a new CSV file
df.to_csv('cleaned_hackathons.csv', index=False)