In [3]:
# Pandas Playground

In [4]:
import pandas as pd
import scipy
import seaborn as sns
from sklearn.datasets import load_wine

In [3]:
# Load the scikit-learn wine dataset bundle. It gets its own name so that
# `data` only ever refers to the DataFrame built below.
wine = load_wine()
# Lift pandas' display limits so every column (feature) is shown.
# NOTE: this is a session-wide setting and it also removes the row limit,
# so always slice large frames (e.g. with .head()) before displaying them.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
# Build a DataFrame from the feature matrix, labelled with the feature names
data = pd.DataFrame(data=wine['data'], columns=wine['feature_names'])
# Print the first 5 observations
print(data.head())


   alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
0    14.23        1.71  2.43               15.6      127.0           2.80   
1    13.20        1.78  2.14               11.2      100.0           2.65   
2    13.16        2.36  2.67               18.6      101.0           2.80   
3    14.37        1.95  2.50               16.8      113.0           3.85   
4    13.24        2.59  2.87               21.0      118.0           2.80   

   flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity   hue  \
0        3.06                  0.28             2.29             5.64  1.04   
1        2.76                  0.26             1.28             4.38  1.05   
2        3.24                  0.30             2.81             5.68  1.03   
3        3.49                  0.24             2.18             7.80  0.86   
4        2.69                  0.39             1.82             4.32  1.04   

   od280/od315_of_diluted_wines  proline  
0                  

# Titanic

In [5]:
# Load seaborn's Titanic example dataset into a DataFrame
df = sns.load_dataset(name='titanic')
# Preview the first five rows
df.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [8]:
# Inspect the dtype pandas assigned to each column
df.dtypes

survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

In [9]:
# Count the missing (NA) values in each column
df.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [12]:
# Show the first five rows where `survived` is 1
df.query("survived == 1").head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False


In [14]:
# Number of rows for each value of the `class` column, most frequent first
df['class'].value_counts()

Third     491
First     216
Second    184
Name: class, dtype: int64

In [17]:
# Highest fare within each `class`.
# `class` is a categorical column, so `observed` is passed explicitly: the
# implicit default is deprecated in newer pandas, and observed=False keeps the
# existing behaviour of reporting every category.
df.groupby('class', observed=False)["fare"].max()

class
First     512.3292
Second     73.5000
Third      69.5500
Name: fare, dtype: float64

In [None]:
# Useful reference: side-by-side comparison of pandas and R idioms
# https://pandas.pydata.org/docs/getting_started/comparison/comparison_with_r.html#compare-with-r

In [41]:
# For passengers aged 35, 38 or 55: per (embark_town, sex) group, report the
# 25th and 75th percentiles of `survived`, `fare` and the age/fare ratio.
(df.query('age in [35, 38, 55]')
   # The lambdas compute the ratio from the filtered frame flowing through the
   # chain, rather than reaching back to the unfiltered `df` and relying on
   # index alignment to discard the extra rows.
   .assign(new_column=lambda d: d["age"] / d["fare"],
           new_column_1=lambda d: d["age"] / d["fare"])
   # Keep only the grouping keys and numeric columns. The categorical `class`
   # column has no quantile: older pandas dropped it silently from the result,
   # newer pandas raises instead.
   .loc[:, ["survived", "sex", "embark_town", "fare", "new_column", "new_column_1"]]
   .groupby(["embark_town", "sex"])
   .quantile([0.25, 0.75])
   .rename(columns={'new_column': 'fraction'})
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,survived,fare,fraction,new_column_1
embark_town,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Cherbourg,female,0.25,1.0,149.40415,0.117665,0.117665
Cherbourg,female,0.75,1.0,369.9271,0.350049,0.350049
Cherbourg,male,0.25,0.5,17.2229,0.693291,0.693291
Cherbourg,male,0.75,1.0,269.4396,2.875502,2.875502
Southampton,female,0.25,1.0,20.4375,0.479249,0.479249
Southampton,female,0.75,1.0,75.88125,1.712963,1.712963
Southampton,male,0.25,0.0,7.3177,1.460435,1.460435
Southampton,male,0.75,0.0,26.215625,4.887382,4.887382
