# Exploratory Analysis and Regressions


## Part 1


### 1.0 Pre-processing


In [1]:
'''
Importing the necessary libraries for the project
'''
import pandas as pd  # for data manipulations
import numpy as np  # for numerical computations
import matplotlib.pyplot as plt  # for data visualization
import seaborn as sns  # for data visualization
import warnings as wr  # for ignoring warnings
wr.filterwarnings('ignore')

In [6]:
'''
Read the scraped CSV into a DataFrame and preview its first rows
'''
# The path is relative to the current working directory.
df = pd.read_csv(filepath_or_buffer='./baseball_reference_2016_scrape.csv')
df.head(5)

Unnamed: 0,attendance,away_team,away_team_errors,away_team_hits,away_team_runs,boxscore_url,date,field_type,game_duration,game_type,home_team,home_team_errors,home_team_hits,home_team_runs,other_info_string,start_time,venue
0,"40,030']",New York Mets,1,7,3,https://www.baseball-reference.com/boxes/KCA/K...,"Sunday, April 3, 2016",,: 3:13,"Night Game, on grass",Kansas City Royals,0,9,4,"<!-- \n <div class=""section_content"" id=""d...",Start Time: 7:38 p.m. Local,: Kauffman Stadium
1,"21,621']",Philadelphia Phillies,0,5,2,https://www.baseball-reference.com/boxes/CIN/C...,"Wednesday, April 6, 2016",,: 2:23,"Night Game, on grass",Cincinnati Reds,0,8,3,"<!-- \n <div class=""section_content"" id=""d...",Start Time: 7:11 p.m. Local,: Great American Ball Park
2,"12,622']",Minnesota Twins,0,5,2,https://www.baseball-reference.com/boxes/BAL/B...,"Wednesday, April 6, 2016",,: 3:11,"Night Game, on grass",Baltimore Orioles,0,9,4,"<!-- \n <div class=""section_content"" id=""d...",Start Time: 7:07 p.m. Local,: Oriole Park at Camden Yards
3,"18,531']",Washington Nationals,0,8,3,https://www.baseball-reference.com/boxes/ATL/A...,"Wednesday, April 6, 2016",,: 2:53,"Night Game, on grass",Atlanta Braves,1,8,1,"<!-- \n <div class=""section_content"" id=""d...",Start Time: 7:10 p.m. Local,: Turner Field
4,"18,572']",Colorado Rockies,1,8,4,https://www.baseball-reference.com/boxes/ARI/A...,"Wednesday, April 6, 2016",,: 2:39,"Day Game, on grass",Arizona Diamondbacks,0,8,3,"<!-- \n <div class=""section_content"" id=""d...",Start Time: 12:40 p.m. Local,: Chase Field


In [8]:
'''
Manual and basic data analysis
'''

# (number of rows, number of columns) of the loaded frame
df.shape

(2463, 17)

In [9]:
# Per-column overview: non-null counts and dtypes (printed, not returned)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2463 entries, 0 to 2462
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   attendance         2463 non-null   object 
 1   away_team          2463 non-null   object 
 2   away_team_errors   2463 non-null   int64  
 3   away_team_hits     2463 non-null   int64  
 4   away_team_runs     2463 non-null   int64  
 5   boxscore_url       2463 non-null   object 
 6   date               2463 non-null   object 
 7   field_type         0 non-null      float64
 8   game_duration      2463 non-null   object 
 9   game_type          2460 non-null   object 
 10  home_team          2463 non-null   object 
 11  home_team_errors   2463 non-null   int64  
 12  home_team_hits     2463 non-null   int64  
 13  home_team_runs     2463 non-null   int64  
 14  other_info_string  2463 non-null   object 
 15  start_time         2463 non-null   object 
 16  venue              2463 

In [18]:
# Summary statistics of the numeric columns, one row per column
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
away_team_errors,2463.0,0.580593,0.793391,0.0,0.0,0.0,1.0,5.0
away_team_hits,2463.0,8.764515,3.511581,1.0,6.0,8.0,11.0,22.0
away_team_runs,2463.0,4.413723,3.104556,0.0,2.0,4.0,6.0,21.0
field_type,0.0,,,,,,,
home_team_errors,2463.0,0.585871,0.805542,0.0,0.0,0.0,1.0,5.0
home_team_hits,2463.0,8.611855,3.436965,0.0,6.0,8.0,11.0,22.0
home_team_runs,2463.0,4.519691,3.111572,0.0,2.0,4.0,6.0,17.0


In [11]:
# Column names as a plain Python list
list(df.columns)

['attendance',
 'away_team',
 'away_team_errors',
 'away_team_hits',
 'away_team_runs',
 'boxscore_url',
 'date',
 'field_type',
 'game_duration',
 'game_type',
 'home_team',
 'home_team_errors',
 'home_team_hits',
 'home_team_runs',
 'other_info_string',
 'start_time',
 'venue']

In [12]:
# Count of missing values in each column
df.isna().sum()

attendance              0
away_team               0
away_team_errors        0
away_team_hits          0
away_team_runs          0
boxscore_url            0
date                    0
field_type           2463
game_duration           0
game_type               3
home_team               0
home_team_errors        0
home_team_hits          0
home_team_runs          0
other_info_string       0
start_time              0
venue                   0
dtype: int64

In [13]:
# Number of distinct non-null values in each column
df.nunique(axis=0, dropna=True)

attendance           2377
away_team              30
away_team_errors        6
away_team_hits         22
away_team_runs         20
boxscore_url         2463
date                  203
field_type              0
game_duration         169
game_type               4
home_team              30
home_team_errors        6
home_team_hits         23
home_team_runs         18
other_info_string    2463
start_time            207
venue                  34
dtype: int64

In [19]:
'''
Data reduction | Removing definitively unnecessary columns
'''

# field_type is NaN for every row, so it carries no information.
# Re-assigning (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution is a no-op, not a KeyError.
df = df.drop(columns='field_type', errors='ignore')

In [None]:
# boxscore_url is only a link to each game's box score page and is not needed
# for the analysis.
# Re-assigning (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution is a no-op, not a KeyError.
df = df.drop(columns='boxscore_url', errors='ignore')

In [22]:
'''
Data cleaning/wrangling

- Fixing data with formatting issues
- Embedded data
'''

# Formatting issue with 'attendance':

# The column should be numeric, but the scrape left strings such as "40,030']".
# Strip the thousands separator (,), the closing bracket (]) and the stray
# single quote (') in one pass. regex=True is spelled out because the default
# of Series.str.replace differs between pandas versions.
# Entries that are still not numeric afterwards (e.g. plain text) are NOT
# dropped: errors='coerce' turns them into NaN, so they can be handled later.
df['attendance'] = pd.to_numeric(
    df['attendance'].str.replace(r"[,'\]]", '', regex=True),
    errors='coerce',
)

In [23]:
# Formatting issue with 'date':

# The column is read as plain strings (object dtype); convert it to a
# datetime dtype so that date-based operations are available.
df['date'] = df['date'].pipe(pd.to_datetime)

In [None]:
# Formatting issue with 'game_duration':

# The raw values are strings such as ': 3:13' (hours:minutes after a stray
# ': ' prefix). Passing them to pd.to_numeric directly would coerce every row
# to NaN, so the hour and minute fields are extracted first and the duration
# is then expressed in hours as a float (e.g. ': 3:13' -> 3.2167).
# Rows that do not contain an H:MM pattern become NaN.
duration_parts = (
    df['game_duration']
    .str.extract(r'(?P<hours>\d+):(?P<minutes>\d{2})')
    .apply(pd.to_numeric, errors='coerce')
)
df['game_duration'] = duration_parts['hours'] + duration_parts['minutes'] / 60