# Dependencies
* Installing `pyarrow` is needed for parquet support

In [None]:
! pip install pyarrow

In [66]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import os
import glob

# Reading Data
* The raw data is stored as many separate parquet files, so we loop over the files and load them into dataframes:

    1. Extracting Table Names using `os.listdir`
    
    2. Creating a dictionary with keys for each table and appending table parquet files as a list for each key

    3. Reading each parquet file for each table and creating a dictionary for each dataframe with table names to concatenate them afterwards

    4. Concatenating each dataframe value and creating a dataframe with relevant name for the table and saving the dataframe as a csv file to be able to use later and share with the team

In [67]:
# Root folder that holds one sub-folder of parquet files per raw data group.
BASE_DIR = 'tennis_data_20231212/raw/'

# Keep only real sub-directories (stray files such as OS metadata would break the
# per-folder os.listdir calls further down) and sort them, because os.listdir
# returns entries in arbitrary order.
folders = sorted(
    entry
    for entry in os.listdir(BASE_DIR)
    if os.path.isdir(os.path.join(BASE_DIR, entry))
)
folders

['raw_match_parquet',
 'raw_odds_parquet',
 'raw_point_by_point_parquet',
 'raw_statistics_parquet',
 'raw_tennis_power_parquet',
 'raw_votes_parquet']

## 1. Extracting Table Names using `os.listdir` and converting it to `set()` so we can have unique names

In [68]:
# Derive a table name from every parquet file: take the part of the file name before
# the first '.', then drop the last '_'-separated token (the per-file suffix).
table_list = [
    parquet_name.partition('.')[0].rpartition('_')[0]
    for folder_name in folders
    for parquet_name in os.listdir(BASE_DIR + folder_name)
]

# Collapse the per-file names into the distinct table names.
table_set = set(table_list)
table_set

{'away_team',
 'away_team_score',
 'event',
 'home_team',
 'home_team_score',
 'odds',
 'pbp',
 'power',
 'round',
 'season',
 'statistics',
 'time',
 'tournament',
 'venue',
 'votes'}

## 2. Creating a dictionary with keys for each table and appending table parquet files as a list for each key

In [69]:
# Map every table name to the list of parquet file paths that belong to it.
table_dict = {table: [] for table in table_set}

for folder_name in folders:
    for parquet_name in os.listdir(BASE_DIR + folder_name):
        # Same naming rule as in step 1: text before the first '.', minus the last '_' token.
        table = parquet_name.partition('.')[0].rpartition('_')[0]
        table_dict[table].append(f"{BASE_DIR}{folder_name}/{parquet_name}")



## 3. Reading each parquet file for each table and creating a dictionary for each dataframe with table names to concatenate them afterwards

In [72]:
# For every table, read each of its parquet files into a DataFrame and collect the
# frames in a list; the lists are concatenated per table in the next step.
dataframes_dict = {table: [] for table in table_set}
for table, parquet_paths in table_dict.items():
    dataframes_dict[table].extend(pd.read_parquet(path) for path in parquet_paths)

In [73]:
# Inspect the list of frames collected for the 'away_team' table
# (one DataFrame per parquet file read in the previous step).
dataframes_dict['away_team']

[   match_id        name                    slug gender  user_count residence  \
 0  11673958  Tirante T.  tirante-thiago-agustin      M        1519  La Plata   
 
             birthplace  height  weight         plays turned_pro  \
 0  La Plata, Argentina    1.85      78  right-handed       None   
 
    current_prize  total_prize  player_id  current_rank name_code    country  \
 0         249335       430793     221515           123       TIR  Argentina   
 
                  full_name  
 0  Tirante, Thiago Agustin  ,
    match_id      name           slug gender  user_count residence birthplace  \
 0  11673962  Heide G.  heide-gustavo      M        1177      None  Sao Paulo   
 
    height weight         plays turned_pro  current_prize  total_prize  \
 0    1.88   None  right-handed       None          36929        87230   
 
    player_id  current_rank name_code country       full_name  
 0     302582           247       HEI  Brazil  Heide, Gustavo  ,
    match_id              name  

## 4. Concatenating each dataframe value and creating a dataframe with relevant name for the table and saving the dataframe as a csv file to be able to use later and share with the team

In [43]:
# Folder that receives one CSV per table.
output_dir = 'tennis_csv/'
# DataFrame.to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

for key in dataframes_dict:
    dataframe_name = f'df_{key}'
    # Stack the per-file frames of this table into a single frame with a fresh index.
    df = pd.concat(dataframes_dict[key], ignore_index=True)
    # Expose the frame as a notebook-level variable named df_<table>;
    # later cells refer to these names directly (e.g. df_tournament, df_home_team).
    globals()[dataframe_name] = df
    csv_file_name = f"{dataframe_name}.csv"
    file_path = os.path.join(output_dir, csv_file_name)
    df.to_csv(file_path, index=False)

  df = globals()[dataframe_name] = pd.concat(dataframes_dict[key], ignore_index=True)


# Loading Data

***STEPS***
1. Loading all the dataframes from csv file

2. Cleaning Dataframes and Understanding what they present

In [44]:
# Folder the CSV files are read back from; this is the same path that the
# concatenated tables were written to (`output_dir`).
INPUT_DIR = 'tennis_csv/'

## 1. Loading all the dataframes from csv file

In [48]:
# Load every CSV in INPUT_DIR into a notebook-level variable named <file stem>_csv
# (e.g. df_tournament.csv -> df_tournament_csv).
for file in sorted(os.listdir(INPUT_DIR)):
    # Skip anything that is not a CSV file (checkpoint folders, OS metadata, ...),
    # which pd.read_csv could not parse.
    if not file.endswith('.csv'):
        continue
    df_name = file.split('.')[0] + '_csv'
    globals()[df_name] = pd.read_csv(os.path.join(INPUT_DIR, file))

### 1. MatchTournamentInfo

1. `match_id`
    * **Numerical**
    * Showing ID of the match and seems like all the tables relate to each other with this ID
    
2. `tournament_id`
    * **Numerical**
    * Showing ID of the tournament for each match. 
    * Values repeat across rows, because several matches (different `match_id`s) belong to the same tournament

3. `tournament_name`
    * **Categorical**
    * This column shows the tournament name, which is a combination of city and country written like this: `City, Country`
    * This column has **55 Unique Values**
4. `tournament_slug`
    * **Categorical**
    * This column shows the tournament name as a slug, i.e. the city and country in lower case joined by hyphens, like this: `city-country`
    * This column has **55 Unique Values**

5. `tournament_unique_id`
    * **Numerical**
    * All the values of this column are missing, so **this column will be dropped in the Cleaning Process**
6. `tournament_category_name`
    * **Categorical**
    * This column shows the category of the tournament
    * Has **5 Unique Values**
        * Challenger
        * WTA
        * ATP
        * ITF Men
        * ITF Women

7. `tournament_category_slug`
    * **Categorical**
    * This column shows the category of the tournament in slug form, meaning the values are lower case and use hyphens instead of spaces
    * Has **5 Unique Values**
        * challenger
        * wta
        * atp
        * itf-men
        * itf-women

8. `user_count`
    * **Numerical**
    * This column shows how many users participated in the tournament
    * Some of the information about this column:
        * Mean : 223.56
        * Min : 2
        * Max : 3627
        * std : 711.36
        * Q1 : 7
        * Q3 : 19
        

9. `ground_type`
    * **Categorical**
    * This column shows what kind of ground the match was played on
    * Has **5 Unique Values**, plus `nan` for the missing entries (6 distinct entries in total)
        * Red clay
        * Hardcourt outdoor
        * Hardcourt indoor
        * Carpet indoor
        * nan
        * Synthetic outdoor
    * **Has 17 Missing Values that will be dropped**


10. `tennis_points`
    * **Numerical**
    * This column shows how many points were scored in this tournament
    * This column has **600 Missing Values** - *Not decided what to do with it*
    * Some of the information about this column:
        * Mean : 406.097561
        * Min : 75
        * Max : 1000
        * std : 412.441871
        * Q1 : 100
        * Q3 : 1000

***All of the following columns have only one unique value and no missing values:***

11. `has_event_player_statistics`

12. `crowd_sourcing_enabled`

13. `has_performance_graph_feature`

14. `display_inverse_home_away_teams`

15. `priority`

16. `competition_type`


In [61]:
# Distinct values of the `ground_type` column (NaN is included in the result).
df_tournament_csv['ground_type'].unique()

array(['Red clay', 'Hardcourt outdoor', 'Hardcourt indoor',
       'Carpet indoor', nan, 'Synthetic outdoor'], dtype=object)

In [74]:
# Preview the first rows of the tournament table reloaded from CSV.
df_tournament_csv.head()

Unnamed: 0,match_id,tournament_id,tournament_name,tournament_slug,tournament_unique_id,tournament_category_name,tournament_category_slug,user_count,ground_type,tennis_points,has_event_player_statistics,crowd_sourcing_enabled,has_performance_graph_feature,display_inverse_home_away_teams,priority,competition_type
0,11673958,121441,"Bogota, Colombia",bogota-colombia,,Challenger,challenger,125,Red clay,125.0,False,False,False,False,0,2
1,11673962,121441,"Bogota, Colombia",bogota-colombia,,Challenger,challenger,125,Red clay,125.0,False,False,False,False,0,2
2,11673963,121441,"Bogota, Colombia",bogota-colombia,,Challenger,challenger,125,Red clay,125.0,False,False,False,False,0,2
3,11673993,121451,"Braga, Portugal",braga-portugal,,Challenger,challenger,163,Red clay,75.0,False,False,False,False,0,2
4,11674000,121453,"Charleston, USA",charleston-usa,,Challenger,challenger,71,Hardcourt outdoor,75.0,False,False,False,False,0,2


In [89]:
# Preview the first rows of the home-team table.
# NOTE(review): this is `df_home_team` (the frame built from the parquet files),
# not the `df_home_team_csv` reload — confirm which one is intended here.
df_home_team.head()

Unnamed: 0,match_id,name,slug,gender,user_count,residence,birthplace,height,weight,plays,turned_pro,current_prize,total_prize,player_id,current_rank,name_code,country,full_name
0,11673958,Barrios Vera T.,barrios-vera-tomas,M,2915,Santiago,"Chillan, Chile",1.91,83.0,right-handed,2014.0,265608,646273,132834,103,BAR,Chile,"Barrios Vera, Tomás"
1,11673962,Tirante T.,tirante-thiago-agustin,M,1519,La Plata,"La Plata, Argentina",1.85,78.0,right-handed,,249335,430793,221515,123,TIR,Argentina,"Tirante, Thiago Agustin"
2,11673963,Heide G.,heide-gustavo,M,1177,,Sao Paulo,1.88,,right-handed,,36929,87230,302582,247,HEI,Brazil,"Heide, Gustavo"
3,11673993,Ajduković D.,ajdukovic-duje,M,1293,"Zagreb, Croatia","Split, Croatia",1.88,75.0,right-handed,,63678,198866,207081,143,AJD,Croatia,"Ajduković, Duje"
4,11674000,Escobedo E.,escobedo-ernesto,M,680,"West Covina, CA, USA","Los Angeles, CA, USA",1.85,82.0,right-handed,2014.0,84375,1280430,80491,404,ESC,Mexico,"Escobedo, Ernesto"


In [88]:
# Preview the first rows of the event table (the frame built from the parquet files).
df_event.head()

Unnamed: 0,match_id,first_to_serve,home_team_seed,away_team_seed,custom_id,winner_code,default_period_count,start_datetime,match_slug,final_result_only
0,11673958,,1,4,JgdbspFNb,2,3,1696094700,tirante-barrios-vera,False
1,11673962,,4,,pFNbsHbwc,1,3,1696176000,heide-tirante,False
2,11673963,,,,fTxsHbwc,1,3,1696087800,heide-jorda-sanchis,False
3,11673993,,,Q,axvsGQHb,2,3,1696154400,ajdukovic-roca-batalla,False
4,11674000,,Q,WC,QjHshlKb,2,3,1696086000,crawford-escobedo,False


In [82]:
# All matches (rows) that belong to the tournament with id 121453.
df_tournament.loc[df_tournament['tournament_id'] == 121453]

Unnamed: 0,match_id,tournament_id,tournament_name,tournament_slug,tournament_unique_id,tournament_category_name,tournament_category_slug,user_count,ground_type,tennis_points,has_event_player_statistics,crowd_sourcing_enabled,has_performance_graph_feature,display_inverse_home_away_teams,priority,competition_type
4,11674000,121453,"Charleston, USA",charleston-usa,,Challenger,challenger,71,Hardcourt outdoor,75,False,False,False,False,0,2
5,11674008,121453,"Charleston, USA",charleston-usa,,Challenger,challenger,71,Hardcourt outdoor,75,False,False,False,False,0,2
6,11674011,121453,"Charleston, USA",charleston-usa,,Challenger,challenger,71,Hardcourt outdoor,75,False,False,False,False,0,2


In [57]:
# List the column names of the tournament table.
df_tournament_csv.columns

Index(['match_id', 'tournament_id', 'tournament_name', 'tournament_slug',
       'tournament_unique_id', 'tournament_category_name',
       'tournament_category_slug', 'user_count', 'ground_type',
       'tennis_points', 'has_event_player_statistics',
       'crowd_sourcing_enabled', 'has_performance_graph_feature',
       'display_inverse_home_away_teams', 'priority', 'competition_type'],
      dtype='object')

In [51]:
# Column dtypes and non-null counts of the tournament table.
df_tournament_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 764 entries, 0 to 763
Data columns (total 16 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   match_id                         764 non-null    int64  
 1   tournament_id                    764 non-null    int64  
 2   tournament_name                  764 non-null    object 
 3   tournament_slug                  764 non-null    object 
 4   tournament_unique_id             0 non-null      float64
 5   tournament_category_name         764 non-null    object 
 6   tournament_category_slug         764 non-null    object 
 7   user_count                       764 non-null    int64  
 8   ground_type                      747 non-null    object 
 9   tennis_points                    164 non-null    float64
 10  has_event_player_statistics      764 non-null    bool   
 11  crowd_sourcing_enabled           764 non-null    bool   
 12  has_performance_graph_

In [52]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df_tournament_csv.describe()

Unnamed: 0,match_id,tournament_id,tournament_unique_id,user_count,tennis_points,priority,competition_type
count,764.0,764.0,0.0,764.0,164.0,764.0,764.0
mean,11699020.0,121710.625654,,223.560209,406.097561,0.0,2.0
std,5029.278,538.709546,,711.361346,412.441871,0.0,0.0
min,11673960.0,119387.0,,2.0,75.0,0.0,2.0
25%,11699010.0,121843.0,,7.0,100.0,0.0,2.0
50%,11699350.0,121856.0,,11.0,100.0,0.0,2.0
75%,11701510.0,121870.0,,19.0,1000.0,0.0,2.0
max,11703520.0,121898.0,,3627.0,1000.0,0.0,2.0


In [53]:
# Number of missing values per column.
df_tournament_csv.isna().sum()

match_id                             0
tournament_id                        0
tournament_name                      0
tournament_slug                      0
tournament_unique_id               764
tournament_category_name             0
tournament_category_slug             0
user_count                           0
ground_type                         17
tennis_points                      600
has_event_player_statistics          0
crowd_sourcing_enabled               0
has_performance_graph_feature        0
display_inverse_home_away_teams      0
priority                             0
competition_type                     0
dtype: int64

In [55]:
# Number of distinct values per column. nunique() ignores NaN by default, so a
# column with missing entries reports only its non-missing distinct values.
df_tournament_csv.nunique()

match_id                           764
tournament_id                       56
tournament_name                     55
tournament_slug                     55
tournament_unique_id                 0
tournament_category_name             5
tournament_category_slug             5
user_count                          31
ground_type                          5
tennis_points                        6
has_event_player_statistics          1
crowd_sourcing_enabled               1
has_performance_graph_feature        1
display_inverse_home_away_teams      1
priority                             1
competition_type                     1
dtype: int64

In [63]:
# Preview the first rows of the home-team table (frame built from the parquet files).
# NOTE(review): the same preview already appears in an earlier cell — consider removing one.
df_home_team.head()

Unnamed: 0,match_id,name,slug,gender,user_count,residence,birthplace,height,weight,plays,turned_pro,current_prize,total_prize,player_id,current_rank,name_code,country,full_name
0,11673958,Barrios Vera T.,barrios-vera-tomas,M,2915,Santiago,"Chillan, Chile",1.91,83.0,right-handed,2014.0,265608,646273,132834,103,BAR,Chile,"Barrios Vera, Tomás"
1,11673962,Tirante T.,tirante-thiago-agustin,M,1519,La Plata,"La Plata, Argentina",1.85,78.0,right-handed,,249335,430793,221515,123,TIR,Argentina,"Tirante, Thiago Agustin"
2,11673963,Heide G.,heide-gustavo,M,1177,,Sao Paulo,1.88,,right-handed,,36929,87230,302582,247,HEI,Brazil,"Heide, Gustavo"
3,11673993,Ajduković D.,ajdukovic-duje,M,1293,"Zagreb, Croatia","Split, Croatia",1.88,75.0,right-handed,,63678,198866,207081,143,AJD,Croatia,"Ajduković, Duje"
4,11674000,Escobedo E.,escobedo-ernesto,M,680,"West Covina, CA, USA","Los Angeles, CA, USA",1.85,82.0,right-handed,2014.0,84375,1280430,80491,404,ESC,Mexico,"Escobedo, Ernesto"


In [65]:
# Preview the first rows of the away-team score table (frame built from the parquet files).
df_away_team_score.head()

Unnamed: 0,match_id,current_score,display_score,period_1,period_2,period_3,period_4,period_5,period_1_tie_break,period_2_tie_break,period_3_tie_break,period_4_tie_break,period_5_tie_break,normal_time
0,11673958,2.0,2.0,6.0,6.0,,,,,,,,,
1,11673962,,,,,,,,,,,,,
2,11673963,0.0,0.0,6.0,4.0,,,,,,,,,
3,11673993,2.0,2.0,4.0,6.0,6.0,,,,,,,,
4,11674000,1.0,1.0,6.0,2.0,,,,,,,,,
