## Importing pandas

First, we need to import the `pandas` library. By convention, it is imported as `pd`.

In [1]:
import pandas as pd

## Loading Data

We will use CSV file and load the file into a DataFrame.

In [2]:
pengiuns_df=pd.read_csv('datafiles/penguins_size.csv')
pengiuns_df

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,FEMALE
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,MALE
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,FEMALE


In [3]:
shape=pengiuns_df.shape
print("Shape of the DataFrame:", shape)
print("Number of rows:", shape[0])
print("Number of columns:", shape[1])


Shape of the DataFrame: (344, 7)
Number of rows: 344
Number of columns: 7


In [4]:
pengiuns_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   culmen_length_mm   342 non-null    float64
 3   culmen_depth_mm    342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                334 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [5]:
# species_counts=pengiuns_df['species'].value_counts()
species_counts=pengiuns_df['species'].unique()
print("Count of each unique species:")
species_counts

Count of each unique species:


array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)

In [6]:
nan_counts=pengiuns_df.isna().sum()
print("Number of NaN values per column:")
nan_counts

Number of NaN values per column:


species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64

In [7]:
culmen_depth_summary=pengiuns_df.groupby(['island'])['culmen_depth_mm'].describe()
print("culmen_depth_mm for each island:")
culmen_depth_summary

culmen_depth_mm for each island:


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
island,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Biscoe,167.0,15.87485,1.820721,13.1,14.5,15.5,17.0,21.1
Dream,124.0,18.344355,1.133116,15.5,17.5,18.4,19.0,21.2
Torgersen,51.0,18.429412,1.339447,15.9,17.35,18.4,19.25,21.5


In [8]:
pengiuns_df.groupby(['sex'])['body_mass_g'].aggregate(['min', 'max', 'mean', 'median'])

Unnamed: 0_level_0,min,max,mean,median
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
.,4875.0,4875.0,4875.0,4875.0
FEMALE,2700.0,5200.0,3862.272727,3650.0
MALE,3250.0,6300.0,4545.684524,4300.0


In [9]:
pengiuns_df.groupby(['island', 'species']).size().unstack(fill_value=0)

species,Adelie,Chinstrap,Gentoo
island,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Biscoe,44,0,124
Dream,56,68,0
Torgersen,52,0,0


In [10]:
female_mask = pengiuns_df['sex'] == 'FEMALE'
female_penguins = pengiuns_df[female_mask]
female_counts = female_penguins.groupby('species').size()
total_counts = pengiuns_df.groupby('species').size()
# Calculate the share of female penguins in each species
female_share = (female_counts / total_counts) * 100
print("Share of female penguins in each species (in %):")
female_share

Share of female penguins in each species (in %):


species
Adelie       48.026316
Chinstrap    50.000000
Gentoo       46.774194
dtype: float64

In [11]:
flipper_length_comparisons=pengiuns_df.groupby('species')['flipper_length_mm'].aggregate(['min', 'max', 'mean'])
print("Flipper length comparisons between species (min, max, mean):")
flipper_length_comparisons

Flipper length comparisons between species (min, max, mean):


Unnamed: 0_level_0,min,max,mean
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adelie,172.0,210.0,189.953642
Chinstrap,178.0,212.0,195.823529
Gentoo,203.0,231.0,217.186992


In [32]:
par1_df=pd.read_parquet('datafiles/away_team_12260075.parquet', engine='pyarrow')
par1_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   match_id       1 non-null      int64  
 1   name           1 non-null      object 
 2   slug           1 non-null      object 
 3   gender         1 non-null      object 
 4   user_count     1 non-null      int64  
 5   residence      1 non-null      object 
 6   birthplace     1 non-null      object 
 7   height         1 non-null      float64
 8   weight         1 non-null      int64  
 9   plays          1 non-null      object 
 10  turned_pro     0 non-null      object 
 11  current_prize  1 non-null      int64  
 12  total_prize    1 non-null      int64  
 13  player_id      1 non-null      int64  
 14  current_rank   1 non-null      int64  
 15  name_code      1 non-null      object 
 16  country        1 non-null      object 
 17  full_name      1 non-null      object 
dtypes: float64(1),

In [25]:
folder_path = 'datafiles/raw_match_parquet'
import os
par_df = []

for file in os.listdir(folder_path):
    if file.endswith('.parquet'):
        # Read the Parquet file into a DataFrame
        df = pd.read_parquet(os.path.join(folder_path, file))
        par_df.append(df)
combined_df = pd.concat(par_df, ignore_index=True)

  combined_df = pd.concat(par_df, ignore_index=True)


In [28]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5740 entries, 0 to 5739
Data columns (total 62 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   match_id                         5740 non-null   int64  
 1   period_1                         1712 non-null   float64
 2   period_2                         1693 non-null   float64
 3   period_3                         452 non-null    float64
 4   period_4                         0 non-null      object 
 5   period_5                         0 non-null      object 
 6   current_period_start_timestamp   568 non-null    float64
 7   current_score                    1154 non-null   float64
 8   display_score                    1154 non-null   float64
 9   period_1_tie_break               100 non-null    float64
 10  period_2_tie_break               88 non-null     float64
 11  period_3_tie_break               24 non-null     float64
 12  period_4_tie_break  