In [2]:
import pandas as pd
import glob

# Folder that holds the per-tournament World Cup CSV exports
# (the original notebook expected 13 files here).
data_folder = "WorldCup_Stats"

# glob.glob returns matches in arbitrary, filesystem-dependent order; sort them
# so the row order of the concatenated frame is the same on every run/machine.
csv_files = sorted(glob.glob(f"{data_folder}/*.csv"))

# Fail early with a clear message instead of letting pd.concat raise its
# generic "No objects to concatenate" error on an empty list.
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in folder '{data_folder}'")

# Read every file and stack them into one DataFrame with a fresh 0..n-1 index
dataframes = [pd.read_csv(file) for file in csv_files]
crick_df = pd.concat(dataframes, ignore_index=True)

# NOTE(review): the captured .info() output lists 'Unnamed: 0' / 'Unnamed: 0.1'
# columns, which look like index columns written out by an earlier to_csv call.
# They are left untouched here in case later cells rely on them - confirm.

# Summarise the combined frame (info prints; head is the cell's rich output)
crick_df.info()
crick_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 528 entries, 0 to 527
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0.1     528 non-null    int64  
 1   Unnamed: 0       528 non-null    int64  
 2   date             364 non-null    object 
 3   venue            528 non-null    object 
 4   match_category   528 non-null    object 
 5   team_1           528 non-null    object 
 6   team_2           528 non-null    object 
 7   team_1_runs      518 non-null    float64
 8   team_1_wickets   518 non-null    float64
 9   team_2_runs      513 non-null    float64
 10  team_2_wickets   513 non-null    float64
 11  result           528 non-null    object 
 12  pom              510 non-null    object 
 13  best_batters     250 non-null    object 
 14  best_bowlers     250 non-null    object 
 15  commentary_line  83 non-null     object 
 16  world_cup_year   528 non-null    int64  
 17  host_country    

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,date,venue,match_category,team_1,team_2,team_1_runs,team_1_wickets,team_2_runs,team_2_wickets,result,pom,best_batters,best_bowlers,commentary_line,world_cup_year,host_country
0,0,11,,Nottingham,League-Match,PAK,SL,330.0,6.0,138.0,0.0,Pakistan won by 192 runs,Zaheer Abbas,,,,1975,England
1,1,5,,Leeds,League-Match,EAf,IND,120.0,0.0,123.0,0.0,India won by 10 wickets (with 181 balls remain...,Farokh Engineer,,,,1975,England
2,2,12,1975-06-18,Leeds,Semi-Final,ENG,AUS,93.0,0.0,94.0,6.0,Australia won by 4 wickets (with 188 balls rem...,Gary Gilmour,,,,1975,England
3,3,8,1975-06-14,Birmingham,League-Match,ENG,EAf,290.0,5.0,94.0,0.0,England won by 196 runs,John Snow,,,,1975,England
4,4,13,,The Oval,Semi-Final,NZ,WI,158.0,0.0,159.0,5.0,West Indies won by 5 wickets (with 119 balls r...,Alvin Kallicharran,,,,1975,England


In [3]:
# Overview of the combined frame: column list, non-null counts and dtypes
print("DataFrame Structure:")
crick_df.info()

# Each section prints its heading followed by the value on the next line
print("\nFirst 5 Rows:", crick_df.head(), sep="\n")
print("\nColumns:", crick_df.columns, sep="\n")
print("\nData Types:", crick_df.dtypes, sep="\n")

DataFrame Structure:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 528 entries, 0 to 527
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0.1     528 non-null    int64  
 1   Unnamed: 0       528 non-null    int64  
 2   date             364 non-null    object 
 3   venue            528 non-null    object 
 4   match_category   528 non-null    object 
 5   team_1           528 non-null    object 
 6   team_2           528 non-null    object 
 7   team_1_runs      518 non-null    float64
 8   team_1_wickets   518 non-null    float64
 9   team_2_runs      513 non-null    float64
 10  team_2_wickets   513 non-null    float64
 11  result           528 non-null    object 
 12  pom              510 non-null    object 
 13  best_batters     250 non-null    object 
 14  best_bowlers     250 non-null    object 
 15  commentary_line  83 non-null     object 
 16  world_cup_year   528 non-null    int64  


In [4]:
# Count rows that are exact copies of an earlier row across every column
duplicates = crick_df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")

# Rebind the frame only when there is actually something to drop
if duplicates:
    crick_df = crick_df.drop_duplicates()
    print(f"Duplicate rows removed. New shape: {crick_df.shape}")


Number of duplicate rows: 0


In [5]:
# Check for null values in each column
print("\nNull Values Per Column:")
print(crick_df.isnull().sum())

# Number of COLUMNS (not rows) that contain at least one null value
null_values = crick_df.isnull().any().sum()
if null_values > 0:
    rows_before = len(crick_df)
    # how='all' removes only rows in which EVERY column is null; rows that are
    # merely missing some fields are kept, so nulls can remain afterwards.
    crick_df = crick_df.dropna(how='all')
    rows_removed = rows_before - len(crick_df)
    # Report what really happened instead of claiming that all rows with
    # nulls were removed.
    print(
        f"\nRemoved {rows_removed} completely empty rows "
        f"({null_values} columns contained nulls before this step). "
        f"New shape: {crick_df.shape}"
    )
else:
    print("\nNo null values detected.")


Null Values Per Column:
Unnamed: 0.1         0
Unnamed: 0           0
date               164
venue                0
match_category       0
team_1               0
team_2               0
team_1_runs         10
team_1_wickets      10
team_2_runs         15
team_2_wickets      15
result               0
pom                 18
best_batters       278
best_bowlers       278
commentary_line    445
world_cup_year       0
host_country         0
dtype: int64

Rows with null values removed. New shape: (528, 18)


In [6]:
# Re-inspect the DataFrame structure after the duplicate and null-handling cells.
# The earlier dropna(how='all') only removes rows that are entirely null, so
# columns can still report fewer non-null values than the total row count here.
print("\nCleaned DataFrame Structure:")
crick_df.info()


Cleaned DataFrame Structure:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 528 entries, 0 to 527
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0.1     528 non-null    int64  
 1   Unnamed: 0       528 non-null    int64  
 2   date             364 non-null    object 
 3   venue            528 non-null    object 
 4   match_category   528 non-null    object 
 5   team_1           528 non-null    object 
 6   team_2           528 non-null    object 
 7   team_1_runs      518 non-null    float64
 8   team_1_wickets   518 non-null    float64
 9   team_2_runs      513 non-null    float64
 10  team_2_wickets   513 non-null    float64
 11  result           528 non-null    object 
 12  pom              510 non-null    object 
 13  best_batters     250 non-null    object 
 14  best_bowlers     250 non-null    object 
 15  commentary_line  83 non-null     object 
 16  world_cup_year   528 non-null   

In [7]:
# Per-column count of missing entries (isna is pandas' alias for isnull)
print("\nMissing Values per Column:")
print(crick_df.isna().sum())


Missing Values per Column:
Unnamed: 0.1         0
Unnamed: 0           0
date               164
venue                0
match_category       0
team_1               0
team_2               0
team_1_runs         10
team_1_wickets      10
team_2_runs         15
team_2_wickets      15
result               0
pom                 18
best_batters       278
best_bowlers       278
commentary_line    445
world_cup_year       0
host_country         0
dtype: int64


In [8]:
# Keep only the column labels for which at least one entry is missing
columns_with_missing = crick_df.columns[crick_df.isna().any()]
print(f"\nColumns with missing values: {columns_with_missing.tolist()}")


Columns with missing values: ['date', 'team_1_runs', 'team_1_wickets', 'team_2_runs', 'team_2_wickets', 'pom', 'best_batters', 'best_bowlers', 'commentary_line']


In [10]:
# Drop every row that has a missing value in ANY column.
# NOTE(review): this is very aggressive for this dataset - the null report in
# the earlier cells shows sparse columns such as 'commentary_line',
# 'best_batters' and 'best_bowlers', so most matches are discarded here.
# Consider dropna(subset=[...]) restricted to the columns the analysis actually
# needs - confirm intent before changing.
rows_before = len(crick_df)
crick_df = crick_df.dropna()
print("\nDropped rows with missing values.")
# Make the size of the loss visible instead of dropping rows silently
print(f"Rows kept: {len(crick_df)} of {rows_before}")


Dropped rows with missing values.
