In [142]:
import pandas as pd
import os

In [143]:
def create_df(path, header_row=0):
    '''
        Load every .csv file found directly inside a folder and stack them
        into a single dataframe.

        path: the path to the folder containing the files; a trailing
              separator is optional ('./csv' and './csv/' both work)
        header_row: the row of each file that should be the header. default at 0

        returns: one dataframe with the rows of all files, re-indexed from 0
        raises: ValueError if the folder contains no .csv files
    '''
    # os.listdir returns names in arbitrary order; sorting keeps the row
    # order of the combined dataframe the same on every run and machine.
    csv_names = sorted(f for f in os.listdir(path) if f.endswith('.csv'))
    if not csv_names:
        # pd.concat would raise a ValueError here anyway, but with a message
        # that does not mention the folder.
        raise ValueError("No .csv files found in {!r}".format(path))

    # os.path.join only inserts a separator when one is missing, unlike
    # plain string concatenation which silently builds a wrong file name.
    li_dfs = [
        pd.read_csv(os.path.join(path, f), header=header_row)
        for f in csv_names
    ]

    final_df = pd.concat(li_dfs, axis=0, ignore_index=True)
    return final_df

In [144]:
def clean_col_names(col_df):
    """
        Remove unnecessary columns from the dataframe and clean the names of the ones we want to keep.

        The dataframe is modified in place and nothing is returned.
        Columns that are already absent are skipped, so calling this again
        on an already-cleaned dataframe leaves it unchanged instead of
        raising a KeyError.

        col_df: the dataframe whose columns should be dropped / renamed
    """
    unwanted_cols = [
        'Rk', 'G#', 'Week', 'Tm', 'Unnamed: 7', 'GS', 'AY/A', 'Tgt', 'TD.2',
        'Rec', 'Y/R', 'Ctch%', 'Y/Tgt', 'TD.3', 'Pts', 'FL', 'FF', 'FR',
        'Num.1', 'Pct.1', 'Num.2', 'Pct.2', 'Yds.4', 'TD.4', '2PM', 'Yds.3',
    ]
    new_names = {
        "Cmp%": "Cmp_perc",
        "Rate": "QBR",
        "Att": "Pass_att",
        "Yds": "Pass_yds",
        "TD": "Pass_td",
        "Sk": "Sacks",
        "Y/A": "Pass_yds_attmpt",
        "Yds.1": "Sack_yds",
        "Att.1": "Rush_att",
        "Yds.2": "Rush_yds",
        "Y/A.1": "Rush_yds_attmpt",
        "TD.1": "Rush_td",
        "Num": "Play_num",
        "Pct": "Play_pct",
    }

    # errors='ignore' makes the drop tolerant of missing labels, which is
    # what keeps a re-run of the calling cell from failing.
    col_df.drop(columns=unwanted_cols, inplace=True, errors='ignore')
    # rename already skips keys that are not present in the dataframe.
    col_df.rename(columns=new_names, inplace=True)


In [145]:
# Read every .csv file in the ./csv/ folder into one combined dataframe.
df = create_df('./csv/')

In [146]:
# Preview the first rows of the combined data before any cleaning.
df.head()

Unnamed: 0,Name,Rk,Date,G#,Week,Age,Tm,Unnamed: 7,Opp,Result,GS,Cmp,Att,Cmp%,Yds,TD,Int,Rate,Sk,Yds.1,Y/A,AY/A,Att.1,Yds.2,Y/A.1,TD.1,TD.2,Pts,Fmb,FL,FF,FR,Yds.3,TD.3,Num,Pct,Num.1,Pct.1,Num.2,Pct.2,Tgt,Rec,Y/R,Ctch%,Y/Tgt,Yds.4,TD.4,2PM
0,T.Brady,1,9/13/20,1,1,43.041,TAM,@,NOR,L 23-34,*,23.0,36.0,63.89,239.0,2.0,2.0,78.4,3.0,15.0,6.64,5.25,3.0,9.0,3.0,1.0,1.0,6.0,1.0,0.0,0.0,1.0,0.0,0.0,70.0,100%,0.0,0%,0.0,0%,,,,,,,,
1,T.Brady,2,9/20/20,2,2,43.048,TAM,,CAR,W 31-17,*,23.0,35.0,65.71,217.0,1.0,1.0,80.3,0.0,0.0,6.2,5.49,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,-3.0,0.0,61.0,100%,0.0,0%,0.0,0%,,,,,,,,
2,T.Brady,3,9/27/20,3,3,43.055,TAM,@,DEN,W 28-10,*,25.0,38.0,65.79,297.0,3.0,0.0,115.8,2.0,12.0,7.82,9.39,5.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,0.0,68.0,100%,0.0,0%,0.0,0%,,,,,,,,
3,T.Brady,4,10/4/20,4,4,43.062,TAM,,LAC,W 38-31,*,30.0,46.0,65.22,369.0,5.0,1.0,117.0,0.0,0.0,8.02,9.22,3.0,-3.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,76.0,100%,0.0,0%,0.0,0%,,,,,,,,
4,T.Brady,5,10/8/20,5,5,43.066,TAM,@,CHI,L 19-20,*,25.0,41.0,60.98,253.0,1.0,0.0,86.7,3.0,20.0,6.17,6.66,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,-2.0,0.0,71.0,100%,0.0,0%,0.0,0%,,,,,,,,


In [147]:
# Drop the unused columns and rename the remaining ones. clean_col_names
# changes df in place and returns nothing, so df is inspected directly.
clean_col_names(df)
df.head()

Unnamed: 0,Name,Date,Age,Opp,Result,Cmp,Pass_att,Cmp_perc,Pass_yds,Pass_td,Int,QBR,Sacks,Sack_yds,Pass_yds_attmpt,Rush_att,Rush_yds,Rush_yds_attmpt,Rush_td,Fmb,Play_num,Play_pct
0,T.Brady,9/13/20,43.041,NOR,L 23-34,23.0,36.0,63.89,239.0,2.0,2.0,78.4,3.0,15.0,6.64,3.0,9.0,3.0,1.0,1.0,70.0,100%
1,T.Brady,9/20/20,43.048,CAR,W 31-17,23.0,35.0,65.71,217.0,1.0,1.0,80.3,0.0,0.0,6.2,1.0,0.0,0.0,0.0,1.0,61.0,100%
2,T.Brady,9/27/20,43.055,DEN,W 28-10,25.0,38.0,65.79,297.0,3.0,0.0,115.8,2.0,12.0,7.82,5.0,0.0,0.0,0.0,1.0,68.0,100%
3,T.Brady,10/4/20,43.062,LAC,W 38-31,30.0,46.0,65.22,369.0,5.0,1.0,117.0,0.0,0.0,8.02,3.0,-3.0,-1.0,0.0,0.0,76.0,100%
4,T.Brady,10/8/20,43.066,CHI,L 19-20,25.0,41.0,60.98,253.0,1.0,0.0,86.7,3.0,20.0,6.17,3.0,0.0,0.0,0.0,1.0,71.0,100%


In [148]:
def clean_cols():
    """
    Placeholder for cleaning the column values; nothing is implemented yet.
    The body is only this docstring, so calling it does nothing and returns None.
    NOTE(review): the function takes no dataframe argument yet.

    Planned steps:
      - Strip month from date
      - Convert this to a datetime column (presumably the date column; confirm)
      - Create a win-or-loss column
      - From this same column, get the offense score
        (NOTE(review): presumably the Result column, e.g. "L 23-34"; confirm)
    """
