In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import pandas as pd

# Base path to the files in Google Drive
base_path = '/content/drive/My Drive/bigdata/project/data/raw/'

# File paths
deliveries_path = base_path + 'deliveries.csv'
matches_path = base_path + 'matches.csv'

# Load datasets
deliveries_df = pd.read_csv(deliveries_path)
matches_df = pd.read_csv(matches_path)

# Rename 'id' to 'match_id' in matches for consistency
matches_df = matches_df.rename(columns={'id': 'match_id'})

# Merge the two datasets on match_id
merged_df = pd.merge(deliveries_df, matches_df, on='match_id', how='left')

# Select relevant columns for analysis
merged_df_cleaned = merged_df[[
    'match_id', 'season', 'city', 'date', 'venue',
    'batting_team', 'bowling_team', 'batter', 'bowler', 'non_striker',
    'batsman_runs', 'extra_runs', 'total_runs', 'is_wicket', 'player_dismissed',
    'dismissal_kind', 'toss_winner', 'toss_decision', 'winner', 'result',
    'result_margin', 'super_over'
]]

# Fill missing values for critical fields
merged_df_cleaned['season'] = merged_df_cleaned['season'].fillna('Unknown')
merged_df_cleaned['winner'] = merged_df_cleaned['winner'].fillna('No Result')
merged_df_cleaned['result'] = merged_df_cleaned['result'].fillna('normal')
merged_df_cleaned['result_margin'] = merged_df_cleaned['result_margin'].fillna(0)
merged_df_cleaned['super_over'] = merged_df_cleaned['super_over'].fillna('N')

# Show final shape and preview
print("Cleaned merged data shape:", merged_df_cleaned.shape)
display(merged_df_cleaned.head())



Cleaned merged data shape: (260920, 22)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df_cleaned['season'] = merged_df_cleaned['season'].fillna('Unknown')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df_cleaned['winner'] = merged_df_cleaned['winner'].fillna('No Result')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df_cleaned['result'] = merged_df_cleaned['re

Unnamed: 0,match_id,season,city,date,venue,batting_team,bowling_team,batter,bowler,non_striker,...,total_runs,is_wicket,player_dismissed,dismissal_kind,toss_winner,toss_decision,winner,result,result_margin,super_over
0,335982,2007/08,Bangalore,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,P Kumar,BB McCullum,...,1,0,,,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0,N
1,335982,2007/08,Bangalore,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,SC Ganguly,...,0,0,,,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0,N
2,335982,2007/08,Bangalore,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,SC Ganguly,...,1,0,,,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0,N
3,335982,2007/08,Bangalore,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,SC Ganguly,...,0,0,,,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0,N
4,335982,2007/08,Bangalore,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,SC Ganguly,...,0,0,,,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0,N


In [7]:
import os

# Define base and save paths
cleaned_dir = '/content/drive/My Drive/bigdata/project/data/cleaned'
save_path = f'{cleaned_dir}/ipl_merged_cleaned.csv'

# Create directory if it doesn't exist
os.makedirs(cleaned_dir, exist_ok=True)

# Save the cleaned DataFrame
merged_df_cleaned.to_csv(save_path, index=False)

print(f"✅ Cleaned IPL data saved at:\n{save_path}")


✅ Cleaned IPL data saved at:
/content/drive/My Drive/bigdata/project/data/cleaned/ipl_merged_cleaned.csv
