In [1]:
import pandas as pd

In [2]:
file_path='all_seasons.csv'
data=pd.read_csv(file_path)

In [4]:
print("Initial Dataset Information:")
print(data.info())

Initial Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12844 entries, 0 to 12843
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         12844 non-null  int64  
 1   player_name        12844 non-null  object 
 2   team_abbreviation  12844 non-null  object 
 3   age                12844 non-null  float64
 4   player_height      12844 non-null  float64
 5   player_weight      12844 non-null  float64
 6   college            10990 non-null  object 
 7   country            12844 non-null  object 
 8   draft_year         12844 non-null  object 
 9   draft_round        12844 non-null  object 
 10  draft_number       12844 non-null  object 
 11  gp                 12844 non-null  int64  
 12  pts                12844 non-null  float64
 13  reb                12844 non-null  float64
 14  ast                12844 non-null  float64
 15  net_rating         12844 non-null  float6

In [5]:
print("\nMissing value:\n",data.isnull().sum())


Missing value:
 Unnamed: 0              0
player_name             0
team_abbreviation       0
age                     0
player_height           0
player_weight           0
college              1854
country                 0
draft_year              0
draft_round             0
draft_number            0
gp                      0
pts                     0
reb                     0
ast                     0
net_rating              0
oreb_pct                0
dreb_pct                0
usg_pct                 0
ts_pct                  0
ast_pct                 0
season                  0
dtype: int64


In [6]:
print("\nDuplicate Rows\n:",data.duplicated().sum())


Duplicate Rows
: 0


In [7]:
print("\nSample Data:\n",data.head())


Sample Data:
    Unnamed: 0       player_name team_abbreviation   age  player_height  \
0           0  Randy Livingston               HOU  22.0         193.04   
1           1  Gaylon Nickerson               WAS  28.0         190.50   
2           2      George Lynch               VAN  26.0         203.20   
3           3    George McCloud               LAL  30.0         203.20   
4           4      George Zidek               DEN  23.0         213.36   

   player_weight                college country draft_year draft_round  ...  \
0      94.800728        Louisiana State     USA       1996           2  ...   
1      86.182480  Northwestern Oklahoma     USA       1994           2  ...   
2     103.418976         North Carolina     USA       1993           1  ...   
3     102.058200          Florida State     USA       1989           1  ...   
4     119.748288                   UCLA     USA       1995           1  ...   

    pts  reb  ast  net_rating  oreb_pct  dreb_pct  usg_pct  ts_pc

## There is no Unnamed column and no missing values.

In [8]:
if 'Unnamed:0' in data.columns:
    data.drop(columns=['Unnamed:0'], inplace=True)

## Convertting the height from cm to meters for better readability

In [10]:
data['player_height'] = data['player_height']/100

# Validate numeric ranges to check for anomalies

In [11]:
invalid_heights=data[data['player_height']>2.5]
invalid_weights = data[data['player_weight']>200]
invalid_ages= data[data['age']<18]

# Output anomalies if found 

In [13]:
print("\nInvalid Heights:\n", invalid_heights)
print("\nInvalid Weights:\n", invalid_weights)
print("\nInvalid Age:\n", invalid_ages)


Invalid Heights:
 Empty DataFrame
Columns: [Unnamed: 0, player_name, team_abbreviation, age, player_height, player_weight, college, country, draft_year, draft_round, draft_number, gp, pts, reb, ast, net_rating, oreb_pct, dreb_pct, usg_pct, ts_pct, ast_pct, season]
Index: []

[0 rows x 22 columns]

Invalid Weights:
 Empty DataFrame
Columns: [Unnamed: 0, player_name, team_abbreviation, age, player_height, player_weight, college, country, draft_year, draft_round, draft_number, gp, pts, reb, ast, net_rating, oreb_pct, dreb_pct, usg_pct, ts_pct, ast_pct, season]
Index: []

[0 rows x 22 columns]

Invalid Age:
 Empty DataFrame
Columns: [Unnamed: 0, player_name, team_abbreviation, age, player_height, player_weight, college, country, draft_year, draft_round, draft_number, gp, pts, reb, ast, net_rating, oreb_pct, dreb_pct, usg_pct, ts_pct, ast_pct, season]
Index: []

[0 rows x 22 columns]


## Display the Cleaned data

In [15]:
print("\nCleaned Data Sample:\n", data.head())


Cleaned Data Sample:
    Unnamed: 0       player_name team_abbreviation   age  player_height  \
0           0  Randy Livingston               HOU  22.0         1.9304   
1           1  Gaylon Nickerson               WAS  28.0         1.9050   
2           2      George Lynch               VAN  26.0         2.0320   
3           3    George McCloud               LAL  30.0         2.0320   
4           4      George Zidek               DEN  23.0         2.1336   

   player_weight                college country draft_year draft_round  ...  \
0      94.800728        Louisiana State     USA       1996           2  ...   
1      86.182480  Northwestern Oklahoma     USA       1994           2  ...   
2     103.418976         North Carolina     USA       1993           1  ...   
3     102.058200          Florida State     USA       1989           1  ...   
4     119.748288                   UCLA     USA       1995           1  ...   

    pts  reb  ast  net_rating  oreb_pct  dreb_pct  usg_pc

## Save the cleaned dataset

In [25]:
cleaned_file_path='all_seasons_cleaned.csv'
data.to_csv(cleaned_file_path, index=False)
print("\nCleaned dataset saved to:",cleaned_file_path)


Cleaned dataset saved to: all_seasons_cleaned.csv
