In [1]:
import pandas as pd
import numpy as np

def adjust_main(df):
    """Standardize one season table of per-player statistics.

    Steps, in order:
      1. Drop rows whose ``Player`` value is missing (empty rows).
      2. Keep only the part of each player name that precedes the first ``*``.
      3. Replace every remaining null with ``0.00`` (e.g. 3pt shooting).
      4. For every player that has a ``Tm == 'TOT'`` (total) row, drop that
         player's other rows so only the total line remains.

    The input frame is not modified; surviving rows keep their original
    index labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``Player`` and ``Tm`` columns.

    Returns
    -------
    pandas.DataFrame
        A new, standardized frame.
    """
    # Work on a copy so the caller's frame is never altered
    cleaned = df.loc[df['Player'].notna()].copy()

    # Players name standardization: removing the * mark
    cleaned['Player'] = cleaned['Player'].map(lambda name: name.split('*')[0])

    # Some null fields should be 0.00 (e.g. 3pt shooting)
    cleaned = cleaned.fillna(0.00)

    # Players that moved between franchises have the TOT (total) line.
    # NOTE(review): players are matched by name only, so two different players
    # sharing a name would be treated as one - confirm this is acceptable.
    traded = set(cleaned.loc[cleaned['Tm'] == 'TOT', 'Player'])

    # Per-franchise rows of those players duplicate what TOT already sums up
    redundant = cleaned['Player'].isin(traded) & (cleaned['Tm'] != 'TOT')

    # Returning the standardized DataFrame
    return cleaned.loc[~redundant].copy()

def merging_df(df1, df2, keys, out_path=None):
    """Left-join ``df2`` onto ``df1`` on ``keys`` and save the result as CSV.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Left frame; all of its rows are kept.
    df2 : pandas.DataFrame
        Right frame; it is indexed by ``keys`` before joining.
    keys : list of str
        Column names present in both frames to join on.
    out_path : str or path-like, optional
        Where to write the merged CSV (without the index). When omitted, the
        file goes to ``./data/{season}_std.csv`` using the notebook-global
        ``season``; a ``NameError`` is raised if that global is not defined.

    Returns
    -------
    pandas.DataFrame
        The merged frame.
    """
    from pathlib import Path  # local import keeps the definition self-contained

    merged = df1.join(df2.set_index(keys), on=keys)

    if out_path is None:
        # Legacy default: depends on the loop variable of the calling cell
        out_path = f'./data/{season}_std.csv'

    # Create the destination folder so a fresh checkout does not fail here
    out_path = Path(out_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    merged.to_csv(out_path, index=False)

    return merged
    
def adjust_mvp(df, out_path=None):
    """Label an MVP voting table and cut it at the first empty row.

    A ``Status`` column is added with ``'Candidate'`` everywhere except the
    row labelled ``0``, which gets ``'MVP'``. Every row from the first missing
    ``Player`` onward is then discarded and the result is saved as CSV.
    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Player`` column.
        NOTE(review): assumes the row with index label 0 is the winner -
        confirm against the source files.
    out_path : str or path-like, optional
        Where to write the CSV (without the index). When omitted, the file
        goes to ``./data/mvp/{season}_mvp_std.csv`` using the notebook-global
        ``season``; a ``NameError`` is raised if that global is not defined.

    Returns
    -------
    pandas.DataFrame
        The labelled, truncated voting table.
    """
    from pathlib import Path  # local import keeps the definition self-contained

    # Work on a copy so the caller's frame does not gain a Status column
    ballot = df.copy()

    # Status labelling: everyone is a candidate, row 0 is the winner
    ballot['Status'] = 'Candidate'
    ballot.at[0, 'Status'] = 'MVP'

    # True from the first missing Player onward; those rows are discarded
    past_table_end = ballot['Player'].isna().cummax()
    ballot = ballot.loc[~past_table_end].copy()

    if out_path is None:
        # Legacy default: depends on the loop variable of the calling cell
        out_path = f'./data/mvp/{season}_mvp_std.csv'

    # Create the destination folder so a fresh checkout does not fail here
    out_path = Path(out_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    ballot.to_csv(out_path, index=False)

    return ballot


In [2]:
# Columns holding whole numbers are cast to int32; Season stays an object column
data_types_dict = dict.fromkeys(
    ['Age', 'G', 'MP', 'FG', 'FGA', '3P', '3PA', '2P', '2PA', 'FT', 'FTA',
     'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'First'],
    'int32',
)
data_types_dict['Season'] = 'object'

# One pass per season: standardize totals and advanced stats, merge them,
# attach the MVP voting results, then write the final CSV.
for season in np.arange(1981, 2021):
    # Totals table, without the empty GS column
    df1 = adjust_main(
        pd.read_csv(f'./basketball_reference_dbs/{season}_totals.csv').drop(columns=['GS'])
    )

    # Advanced table: read the first 28 columns except the empty ones (18 and 23)
    col = [idx for idx in range(28) if idx not in (18, 23)]
    df2 = adjust_main(
        pd.read_csv(f'./basketball_reference_dbs/{season}_advanced.csv', usecols=col)
    )

    # Merging both DataFrames
    df1 = merging_df(df1, df2, ['Player', 'Pos', 'Age', 'G', 'MP', 'Tm'])

    # Adjusting the MVP DataFrame
    col = ['Player', 'First', 'Share']
    df3 = adjust_mvp(
        pd.read_csv(f'./basketball_reference_dbs/mvp/{season}_mvp.csv', usecols=col)
    )

    # Merging again
    df1 = merging_df(df1, df3, ['Player'])

    # Players missing from the MVP table get the 'OOR' status; the other
    # unmatched fields become 0.0
    df1['Status'] = df1['Status'].fillna('OOR')
    df1 = df1.fillna(0.0)
    df1['Season'] = season

    df1 = df1.astype(data_types_dict)

    df1.to_csv(f'./data/{season}_std.csv', index=False)



In [3]:
# DataFrame.info() prints the summary itself and returns None, so wrapping it
# in print() only appends a stray "None" line after the summary.
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 529 entries, 0 to 676
Data columns (total 52 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Player  529 non-null    object 
 1   Pos     529 non-null    object 
 2   Age     529 non-null    int32  
 3   Tm      529 non-null    object 
 4   G       529 non-null    int32  
 5   MP      529 non-null    int32  
 6   FG      529 non-null    int32  
 7   FGA     529 non-null    int32  
 8   FG%     529 non-null    float64
 9   3P      529 non-null    int32  
 10  3PA     529 non-null    int32  
 11  3P%     529 non-null    float64
 12  2P      529 non-null    int32  
 13  2PA     529 non-null    int32  
 14  2P%     529 non-null    float64
 15  eFG%    529 non-null    float64
 16  FT      529 non-null    int32  
 17  FTA     529 non-null    int32  
 18  FT%     529 non-null    float64
 19  ORB     529 non-null    int32  
 20  DRB     529 non-null    int32  
 21  TRB     529 non-null    int32  
 22  AS