In [1]:
# Load the required libraries
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset
df = pd.read_csv('../data/FIFA 2018 Statistics.csv')

The `shape` function gives information on dataset size. It's equivalent to `dim()` in R

In [32]:
df.shape()

TypeError: 'tuple' object is not callable

The above error is saying, shape is an attribute, not a method. Just use df.shape (no parentheses).

In [3]:
df.shape

(128, 27)

In [4]:
df.ndim

2

In [5]:
# check the columns
df.columns

Index(['Date', 'Team', 'Opponent', 'Goal Scored', 'Ball Possession %',
       'Attempts', 'On-Target', 'Off-Target', 'Blocked', 'Corners', 'Offsides',
       'Free Kicks', 'Saves', 'Pass Accuracy %', 'Passes',
       'Distance Covered (Kms)', 'Fouls Committed', 'Yellow Card',
       'Yellow & Red', 'Red', 'Man of the Match', '1st Goal', 'Round', 'PSO',
       'Goals in PSO', 'Own goals', 'Own goal Time'],
      dtype='object')

In [6]:
print(df.dtypes)

Date                       object
Team                       object
Opponent                   object
Goal Scored                 int64
Ball Possession %           int64
Attempts                    int64
On-Target                   int64
Off-Target                  int64
Blocked                     int64
Corners                     int64
Offsides                    int64
Free Kicks                  int64
Saves                       int64
Pass Accuracy %             int64
Passes                      int64
Distance Covered (Kms)      int64
Fouls Committed             int64
Yellow Card                 int64
Yellow & Red                int64
Red                         int64
Man of the Match           object
1st Goal                  float64
Round                      object
PSO                        object
Goals in PSO                int64
Own goals                 float64
Own goal Time             float64
dtype: object


The `object` data types are actually categorical variables. 
They CAN be converted to `category` datatype. 
To change the datatype of a specific column, use the `.astype()` function. For example, to see the ‘Item Code’ column as a string, use: `data['Item Code'].astype(str)`

##### Note: Pandas treat string values as Object datatypes. See this [SO discussion](https://stackoverflow.com/questions/34881079/pandas-distinction-between-str-and-object-types)

In [7]:
# check the data type of variables
# In pandas info() is a rough equivalent to str() of R
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128 entries, 0 to 127
Data columns (total 27 columns):
Date                      128 non-null object
Team                      128 non-null object
Opponent                  128 non-null object
Goal Scored               128 non-null int64
Ball Possession %         128 non-null int64
Attempts                  128 non-null int64
On-Target                 128 non-null int64
Off-Target                128 non-null int64
Blocked                   128 non-null int64
Corners                   128 non-null int64
Offsides                  128 non-null int64
Free Kicks                128 non-null int64
Saves                     128 non-null int64
Pass Accuracy %           128 non-null int64
Passes                    128 non-null int64
Distance Covered (Kms)    128 non-null int64
Fouls Committed           128 non-null int64
Yellow Card               128 non-null int64
Yellow & Red              128 non-null int64
Red                       128 non-nul

In [11]:
# Show the categorical variables
df.select_dtypes('object').head()

Unnamed: 0,Date,Team,Opponent,Man of the Match,Round,PSO
0,14-06-2018,Russia,Saudi Arabia,Yes,Group Stage,No
1,14-06-2018,Saudi Arabia,Russia,No,Group Stage,No
2,15-06-2018,Egypt,Uruguay,No,Group Stage,No
3,15-06-2018,Uruguay,Egypt,Yes,Group Stage,No
4,15-06-2018,Morocco,Iran,No,Group Stage,No


In [13]:
# Show the continuous variables
df.select_dtypes('int64','float64').head()

Unnamed: 0,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,Offsides,Free Kicks,Saves,Pass Accuracy %,Passes,Distance Covered (Kms),Fouls Committed,Yellow Card,Yellow & Red,Red,Goals in PSO
0,5,40,13,7,3,3,6,3,11,0,78,306,118,22,0,0,0,0
1,0,60,6,0,3,3,2,1,25,2,86,511,105,10,0,0,0,0
2,0,43,8,3,3,2,0,1,7,3,78,395,112,12,2,0,0,0
3,1,57,14,4,6,4,5,1,13,3,86,589,111,6,0,0,0,0
4,0,64,13,3,6,4,5,0,14,2,86,433,101,22,1,0,0,0


See the [pandas documentation](https://pandas.pydata.org/pandas-docs/stable/api.html) for initial data exploration tasks

In [8]:
# The summary() of R equivalent for pandas is describe()
df.describe()

Unnamed: 0,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,Offsides,Free Kicks,Saves,...,Passes,Distance Covered (Kms),Fouls Committed,Yellow Card,Yellow & Red,Red,1st Goal,Goals in PSO,Own goals,Own goal Time
count,128.0,128.0,128.0,128.0,128.0,128.0,128.0,128.0,128.0,128.0,...,128.0,128.0,128.0,128.0,128.0,128.0,94.0,128.0,12.0,12.0
mean,1.320312,49.992188,12.59375,3.914062,5.273438,3.359375,4.71875,1.34375,14.890625,2.726562,...,462.648438,106.664062,13.546875,1.695312,0.015625,0.015625,39.457447,0.203125,1.0,45.833333
std,1.156519,10.444074,5.245827,2.234403,2.409675,2.403195,2.446072,1.193404,4.724262,2.049447,...,151.186311,11.749537,4.619131,1.325454,0.124507,0.124507,24.496506,0.807049,0.0,29.978275
min,0.0,25.0,3.0,0.0,1.0,0.0,0.0,0.0,5.0,0.0,...,189.0,80.0,5.0,0.0,0.0,0.0,1.0,0.0,1.0,12.0
25%,0.0,42.0,9.0,2.0,4.0,1.75,3.0,0.0,11.0,1.0,...,351.0,101.0,10.0,1.0,0.0,0.0,18.25,0.0,1.0,21.75
50%,1.0,50.0,12.0,3.5,5.0,3.0,5.0,1.0,15.0,2.0,...,462.0,104.5,13.0,2.0,0.0,0.0,39.0,0.0,1.0,35.0
75%,2.0,58.0,15.0,5.0,7.0,4.0,6.0,2.0,18.0,4.0,...,555.25,109.0,16.0,2.0,0.0,0.0,54.75,0.0,1.0,75.75
max,6.0,75.0,26.0,12.0,11.0,10.0,11.0,5.0,26.0,9.0,...,1137.0,148.0,25.0,6.0,1.0,1.0,90.0,4.0,1.0,90.0


In [15]:
# show the Number of missing in each column
missing = pd.DataFrame(df.isnull().sum()).rename(columns = {0: 'total'})

# Create a percentage missing
missing['percent'] = missing['total'] / len(df)

missing.sort_values('percent', ascending = False).head(10)

Unnamed: 0,total,percent
Own goal Time,116,0.90625
Own goals,116,0.90625
1st Goal,34,0.265625
Passes,0,0.0
Goals in PSO,0,0.0
PSO,0,0.0
Round,0,0.0
Man of the Match,0,0.0
Red,0,0.0
Yellow & Red,0,0.0


In [27]:
# Column-wise distribution of missing values
print(df.isnull().sum())

Date                        0
Team                        0
Opponent                    0
Goal Scored                 0
Ball Possession %           0
Attempts                    0
On-Target                   0
Off-Target                  0
Blocked                     0
Corners                     0
Offsides                    0
Free Kicks                  0
Saves                       0
Pass Accuracy %             0
Passes                      0
Distance Covered (Kms)      0
Fouls Committed             0
Yellow Card                 0
Yellow & Red                0
Red                         0
Man of the Match            0
1st Goal                   34
Round                       0
PSO                         0
Goals in PSO                0
Own goals                 116
Own goal Time             116
dtype: int64


##### Basic statistics

In [43]:
#skewness and kurtosis
print("Skewness: %f" % df['Passes'].skew())
print("Kurtosis: %f" % df['Passes'].kurt())

Skewness: 0.773985
Kurtosis: 1.992385
