1. Series: creation, descriptive statistics and indexing

In [1]:
import numpy as np
import pandas as pd
!pip install scikit_learn
from sklearn.datasets import load_iris



In [2]:
grades=pd.Series(range(80,100,2)) #one-dimensional object
print(grades.shape, grades.ndim)
print(grades)

(10,) 1
0    80
1    82
2    84
3    86
4    88
5    90
6    92
7    94
8    96
9    98
dtype: int64


In [3]:
#statistics
print(len(grades))
print(grades.describe())

10
count    10.000000
mean     89.000000
std       6.055301
min      80.000000
25%      84.500000
50%      89.000000
75%      93.500000
max      98.000000
dtype: float64


In [4]:
#controlling decimal points in printing
pd.set_option("display.precision", 2)
print(grades.describe())

count    10.00
mean     89.00
std       6.06
min      80.00
25%      84.50
50%      89.00
75%      93.50
max      98.00
dtype: float64


In [5]:
#specific statistics; supplying data to functions
print(grades.min(),grades.max(),grades.std())
print(f"minimum: {grades.min()}\tmaximum: {grades.max()}\tstandard deviation: {grades.std()}")

80 98 6.0553007081949835
minimum: 80	maximum: 98	standard deviation: 6.0553007081949835


In [6]:
#custom label indexes
height=pd.Series([180,182,190], index=["Laura","Maria","Sofia"])
print(height)

Laura    180
Maria    182
Sofia    190
dtype: int64


In [7]:
print(height[1],height["Maria"],height.Maria)

182 182 182


In [8]:
nations=pd.Series({"Cundinamarca":"Bogotá","Caldas":"Manizales","Antioquia":"Medellín"})
print(nations)

Cundinamarca       Bogotá
Caldas          Manizales
Antioquia        Medellín
dtype: object


2.DataFrames

#Making a dataframe using dictionary


In [9]:
season_temps=pd.DataFrame({"Spring":[1,2,3],"Summer":[4,5,6],"Autumn":[7,8,9],"Winter":[10,11,12]})
season_temps

Unnamed: 0,Spring,Summer,Autumn,Winter
0,1,4,7,10
1,2,5,8,11
2,3,6,9,12


In [10]:
season_temps.T

Unnamed: 0,0,1,2
Spring,1,2,3
Summer,4,5,6
Autumn,7,8,9
Winter,10,11,12


In [11]:
season_temps

Unnamed: 0,Spring,Summer,Autumn,Winter
0,1,4,7,10
1,2,5,8,11
2,3,6,9,12


In [12]:
#dataframe from array
scores=np.array([[1,2,3],[4,5,6],[7,8,9]])
print(scores.ndim,scores.shape,type(scores))

2 (3, 3) <class 'numpy.ndarray'>


In [13]:
scores_pd=pd.DataFrame(scores)
print(scores_pd)
print(scores_pd.shape)

   0  1  2
0  1  2  3
1  4  5  6
2  7  8  9
(3, 3)


In [14]:
#setting column index and row index
scores_pd=pd.DataFrame(scores,columns=["Maria","Sofia","Ana"],index=["First term","Second term","Third term"])
scores_pd

Unnamed: 0,Maria,Sofia,Ana
First term,1,2,3
Second term,4,5,6
Third term,7,8,9


In [15]:
scores_pd.T

Unnamed: 0,First term,Second term,Third term
Maria,1,4,7
Sofia,2,5,8
Ana,3,6,9


In [16]:
#describe method applies to columns
scores_pd.describe()

Unnamed: 0,Maria,Sofia,Ana
count,3.0,3.0,3.0
mean,4.0,5.0,6.0
std,3.0,3.0,3.0
min,1.0,2.0,3.0
25%,2.5,3.5,4.5
50%,4.0,5.0,6.0
75%,5.5,6.5,7.5
max,7.0,8.0,9.0


In [17]:
scores_pd.mean()

Maria    4.0
Sofia    5.0
Ana      6.0
dtype: float64

In [18]:
scores_pd.T.describe()

Unnamed: 0,First term,Second term,Third term
count,3.0,3.0,3.0
mean,2.0,5.0,8.0
std,1.0,1.0,1.0
min,1.0,4.0,7.0
25%,1.5,4.5,7.5
50%,2.0,5.0,8.0
75%,2.5,5.5,8.5
max,3.0,6.0,9.0


3. DataFrame slicing

In [19]:
#column slicing with customized attribute indexes
season_temps[["Spring","Autumn"]] #double brackets for extraction of columns

Unnamed: 0,Spring,Autumn
0,1,7
1,2,8
2,3,9


In [20]:
season_temps.loc[:,["Winter"]] # label indexing

Unnamed: 0,Winter
0,10
1,11
2,12


In [21]:
season_temps.Summer #by calling the column

0    4
1    5
2    6
Name: Summer, dtype: int64

In [22]:
season_temps.iloc[:,0:2] #integer indexing, consecutive columns

Unnamed: 0,Spring,Summer
0,1,4
1,2,5
2,3,6


In [23]:
season_temps.iloc[:,[1,3]] #integer indexing, non-consecutive columns

Unnamed: 0,Summer,Winter
0,4,10
1,5,11
2,6,12


In [24]:
scores_pd[1:3] #row slicing

Unnamed: 0,Maria,Sofia,Ana
Second term,4,5,6
Third term,7,8,9


In [25]:
scores_pd.loc["Second term",["Maria"]]

Maria    4
Name: Second term, dtype: int32

In [26]:
scores_pd.iloc[1,[0]]

Maria    4
Name: Second term, dtype: int32

In [27]:
scores_pd

Unnamed: 0,Maria,Sofia,Ana
First term,1,2,3
Second term,4,5,6
Third term,7,8,9


In [28]:
scores_pd.iloc[1]={"Maria":3,"Sofia":3,"Ana":1} #replacing information

In [29]:
scores_pd

Unnamed: 0,Maria,Sofia,Ana
First term,1,2,3
Second term,3,3,1
Third term,7,8,9


In [30]:
scores_pd.iloc[1]={"Maria":3,"Sofia":3,"Ana":1} #changing values in index 1

In [31]:
scores_pd

Unnamed: 0,Maria,Sofia,Ana
First term,1,2,3
Second term,3,3,1
Third term,7,8,9


In [32]:
scores_pd.at["First term","Maria"] #getting a specific element with label indexing

1

In [33]:
scores_pd.iat[0,0]#getting a specific element with integer indexing

1

4. Sorting by index and value

In [34]:
scores_pd.sort_index(ascending=False) #sorting by label index

Unnamed: 0,Maria,Sofia,Ana
Third term,7,8,9
Second term,3,3,1
First term,1,2,3


In [35]:
season_temps.sort_index(ascending=False) #sorting by integer

Unnamed: 0,Spring,Summer,Autumn,Winter
2,3,6,9,12
1,2,5,8,11
0,1,4,7,10


In [36]:
scores_pd.sort_index(axis=1, ascending=False)

Unnamed: 0,Sofia,Maria,Ana
First term,2,1,3
Second term,3,3,1
Third term,8,7,9


In [37]:
scores_pd.sort_values(by ="First term", axis=1, ascending=False)

Unnamed: 0,Ana,Sofia,Maria
First term,3,2,1
Second term,1,3,3
Third term,9,8,7


In [38]:
scores_pd.sort_values(by="Third term",ascending=True,axis=1)

Unnamed: 0,Maria,Sofia,Ana
First term,1,2,3
Second term,3,3,1
Third term,7,8,9


In [39]:
scores_pd.T.sort_values(by="Third term")

Unnamed: 0,First term,Second term,Third term
Maria,1,3,7
Sofia,2,3,8
Ana,3,1,9


In [40]:
scores_pd.T.sort_values(by="Third term", ascending=False)

Unnamed: 0,First term,Second term,Third term
Ana,3,1,9
Sofia,2,3,8
Maria,1,3,7


In [41]:
scores_pd[scores_pd >=2] #boolean indexing

Unnamed: 0,Maria,Sofia,Ana
First term,,2,3.0
Second term,3.0,3,
Third term,7.0,8,9.0


In [42]:
scores_pd[(scores_pd >=2)&(scores_pd <=5)]

Unnamed: 0,Maria,Sofia,Ana
First term,,2.0,3.0
Second term,3.0,3.0,
Third term,,,


In [43]:
scores_pd[lambda x: x<8]

Unnamed: 0,Maria,Sofia,Ana
First term,1,2.0,3.0
Second term,3,3.0,1.0
Third term,7,,


5. DataFrame with Sk-learn dataset

In [44]:
#laoding embedded sklearn dataset
iris_data=load_iris()
iris_data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [45]:
print(iris_data.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [46]:
iris_features,iris_target=iris_data["data"],iris_data["target"]

In [47]:
print(iris_features.shape, iris_target.shape)

(150, 4) (150,)


In [48]:
iris_full=np.hstack([iris_features,iris_target.reshape(150,1)])

In [49]:
print(iris_full)

[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]
 [5.4 3.9 1.7 0.4 0. ]
 [4.6 3.4 1.4 0.3 0. ]
 [5.  3.4 1.5 0.2 0. ]
 [4.4 2.9 1.4 0.2 0. ]
 [4.9 3.1 1.5 0.1 0. ]
 [5.4 3.7 1.5 0.2 0. ]
 [4.8 3.4 1.6 0.2 0. ]
 [4.8 3.  1.4 0.1 0. ]
 [4.3 3.  1.1 0.1 0. ]
 [5.8 4.  1.2 0.2 0. ]
 [5.7 4.4 1.5 0.4 0. ]
 [5.4 3.9 1.3 0.4 0. ]
 [5.1 3.5 1.4 0.3 0. ]
 [5.7 3.8 1.7 0.3 0. ]
 [5.1 3.8 1.5 0.3 0. ]
 [5.4 3.4 1.7 0.2 0. ]
 [5.1 3.7 1.5 0.4 0. ]
 [4.6 3.6 1.  0.2 0. ]
 [5.1 3.3 1.7 0.5 0. ]
 [4.8 3.4 1.9 0.2 0. ]
 [5.  3.  1.6 0.2 0. ]
 [5.  3.4 1.6 0.4 0. ]
 [5.2 3.5 1.5 0.2 0. ]
 [5.2 3.4 1.4 0.2 0. ]
 [4.7 3.2 1.6 0.2 0. ]
 [4.8 3.1 1.6 0.2 0. ]
 [5.4 3.4 1.5 0.4 0. ]
 [5.2 4.1 1.5 0.1 0. ]
 [5.5 4.2 1.4 0.2 0. ]
 [4.9 3.1 1.5 0.2 0. ]
 [5.  3.2 1.2 0.2 0. ]
 [5.5 3.5 1.3 0.2 0. ]
 [4.9 3.6 1.4 0.1 0. ]
 [4.4 3.  1.3 0.2 0. ]
 [5.1 3.4 1.5 0.2 0. ]
 [5.  3.5 1.3 0.3 0. ]
 [4.5 2.3 1.3 0.3 0. ]
 [4.4 3.2 1.3 0.2 0. ]
 [5.  3.5 1

In [50]:
print(iris_full.shape)

(150, 5)


In [51]:
iris_full[:5,]

array([[5.1, 3.5, 1.4, 0.2, 0. ],
       [4.9, 3. , 1.4, 0.2, 0. ],
       [4.7, 3.2, 1.3, 0.2, 0. ],
       [4.6, 3.1, 1.5, 0.2, 0. ],
       [5. , 3.6, 1.4, 0.2, 0. ]])

In [52]:
#making a data frame from array
iris_feature_df=pd.DataFrame(iris_features,columns=iris_data["feature_names"])
iris_feature_df.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [53]:
iris_feature_df.shape

(150, 4)

In [54]:
iris_feature_df.tail()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3
149,5.9,3.0,5.1,1.8


In [55]:
#renaming
iris_feature_df.columns=["sepal_length","sepal_width","petal_length","petal_width"]

In [56]:
iris_feature_df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [57]:
iris_feature_df.rename(columns={"sepal_length":"Sepal Length","sepal_width":"Sepal Width","petal_length":"Petal Length",
                                "petal_width":"Petal Width"},inplace=True)
#inplace changes column names in the original dataset

In [58]:
iris_feature_df

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [59]:
iris_target_df=pd.DataFrame(iris_target,columns=["target"])
iris_target_df

Unnamed: 0,target
0,0
1,0
2,0
3,0
4,0
...,...
145,2
146,2
147,2
148,2


In [60]:
iris_data["target_names"]

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [61]:
#making label variable from target variable
iris_target_df.loc[iris_target_df["target"]==0, "label"]="setosa"
iris_target_df.loc[iris_target_df["target"]==1,"label"]="versicolor"
iris_target_df.loc[iris_target_df["target"]==2,"label"]="virginica"

In [62]:
iris_target_df.tail()

Unnamed: 0,target,label
145,2,virginica
146,2,virginica
147,2,virginica
148,2,virginica
149,2,virginica


In [63]:
#combining dataframes with pd.concat function
iris_full_df=pd.concat([iris_feature_df,iris_target_df],axis=1)
iris_full_df.head()

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,target,label
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa


In [64]:
iris_full_df.shape

(150, 6)

In [65]:
#subsetting
setosa_df=iris_full_df[iris_full_df.label=="setosa"]

In [66]:
print(setosa_df)

    Sepal Length  Sepal Width  Petal Length  Petal Width  target   label
0            5.1          3.5           1.4          0.2       0  setosa
1            4.9          3.0           1.4          0.2       0  setosa
2            4.7          3.2           1.3          0.2       0  setosa
3            4.6          3.1           1.5          0.2       0  setosa
4            5.0          3.6           1.4          0.2       0  setosa
5            5.4          3.9           1.7          0.4       0  setosa
6            4.6          3.4           1.4          0.3       0  setosa
7            5.0          3.4           1.5          0.2       0  setosa
8            4.4          2.9           1.4          0.2       0  setosa
9            4.9          3.1           1.5          0.1       0  setosa
10           5.4          3.7           1.5          0.2       0  setosa
11           4.8          3.4           1.6          0.2       0  setosa
12           4.8          3.0           1.4        

In [67]:
versicolor_df=iris_full_df[iris_full_df.label=="versicolor"]

In [68]:
versicolor_df

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,target,label
50,7.0,3.2,4.7,1.4,1,versicolor
51,6.4,3.2,4.5,1.5,1,versicolor
52,6.9,3.1,4.9,1.5,1,versicolor
53,5.5,2.3,4.0,1.3,1,versicolor
54,6.5,2.8,4.6,1.5,1,versicolor
55,5.7,2.8,4.5,1.3,1,versicolor
56,6.3,3.3,4.7,1.6,1,versicolor
57,4.9,2.4,3.3,1.0,1,versicolor
58,6.6,2.9,4.6,1.3,1,versicolor
59,5.2,2.7,3.9,1.4,1,versicolor


In [69]:
virginica_df=iris_full_df[iris_full_df.label=="virginica"]

In [70]:
virginica_df.reset_index() #resetting index

Unnamed: 0,index,Sepal Length,Sepal Width,Petal Length,Petal Width,target,label
0,100,6.3,3.3,6.0,2.5,2,virginica
1,101,5.8,2.7,5.1,1.9,2,virginica
2,102,7.1,3.0,5.9,2.1,2,virginica
3,103,6.3,2.9,5.6,1.8,2,virginica
4,104,6.5,3.0,5.8,2.2,2,virginica
5,105,7.6,3.0,6.6,2.1,2,virginica
6,106,4.9,2.5,4.5,1.7,2,virginica
7,107,7.3,2.9,6.3,1.8,2,virginica
8,108,6.7,2.5,5.8,1.8,2,virginica
9,109,7.2,3.6,6.1,2.5,2,virginica


In [71]:
virginica_df.reset_index(drop=True) #resetting index, dropping the previous index

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,target,label
0,6.3,3.3,6.0,2.5,2,virginica
1,5.8,2.7,5.1,1.9,2,virginica
2,7.1,3.0,5.9,2.1,2,virginica
3,6.3,2.9,5.6,1.8,2,virginica
4,6.5,3.0,5.8,2.2,2,virginica
5,7.6,3.0,6.6,2.1,2,virginica
6,4.9,2.5,4.5,1.7,2,virginica
7,7.3,2.9,6.3,1.8,2,virginica
8,6.7,2.5,5.8,1.8,2,virginica
9,7.2,3.6,6.1,2.5,2,virginica


In [72]:
again_previous_dataset=pd.concat([virginica_df,setosa_df,versicolor_df],axis=0).reset_index(drop=True)

In [73]:
again_previous_dataset.tail()

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,target,label
145,5.7,3.0,4.2,1.2,1,versicolor
146,5.7,2.9,4.2,1.3,1,versicolor
147,6.2,2.9,4.3,1.3,1,versicolor
148,5.1,2.5,3.0,1.1,1,versicolor
149,5.7,2.8,4.1,1.3,1,versicolor


In [74]:
again_previous_dataset_1=again_previous_dataset.drop(columns=["Sepal Length"])

In [75]:
print(again_previous_dataset_1)

     Sepal Width  Petal Length  Petal Width  target       label
0            3.3           6.0          2.5       2   virginica
1            2.7           5.1          1.9       2   virginica
2            3.0           5.9          2.1       2   virginica
3            2.9           5.6          1.8       2   virginica
4            3.0           5.8          2.2       2   virginica
..           ...           ...          ...     ...         ...
145          3.0           4.2          1.2       1  versicolor
146          2.9           4.2          1.3       1  versicolor
147          2.9           4.3          1.3       1  versicolor
148          2.5           3.0          1.1       1  versicolor
149          2.8           4.1          1.3       1  versicolor

[150 rows x 5 columns]


In [76]:
class_df=pd.DataFrame({"lec_id":["F1501","F1502","F1503","F1504","F1505"],
                      "Stu_reg":[70,55,14,18,50],
                      "course_n":["ECN101", "BTM103","BTM104","BTM105","ECN201"],
                      "CAG":[3.0,2.98,3.3,3.4,2.7]})
crs_eval_df=pd.DataFrame({"lec_id":["F1501","F1502","F1503","F1506","F1507"],
                      "Stu_reg":[70,55,14,45,65],
                      "course_n":["ECN101", "BTM103","BTM104","BTM104","ECN201"],
                      "CES":[4.0,3.9,4.3,4.4,4.7]})

In [77]:
#merging dataframes based on a key index
merged_df0=pd.merge(class_df,crs_eval_df,on="lec_id")

In [78]:
merged_df0

Unnamed: 0,lec_id,Stu_reg_x,course_n_x,CAG,Stu_reg_y,course_n_y,CES
0,F1501,70,ECN101,3.0,70,ECN101,4.0
1,F1502,55,BTM103,2.98,55,BTM103,3.9
2,F1503,14,BTM104,3.3,14,BTM104,4.3


In [79]:
merged_df1=pd.merge(class_df,crs_eval_df,on=["lec_id","course_n"])

In [80]:
merged_df1

Unnamed: 0,lec_id,Stu_reg_x,course_n,CAG,Stu_reg_y,CES
0,F1501,70,ECN101,3.0,70,4.0
1,F1502,55,BTM103,2.98,55,3.9
2,F1503,14,BTM104,3.3,14,4.3


In [81]:
#Dataframe column slicing
merged_df1[["lec_id","course_n","CAG"]]

Unnamed: 0,lec_id,course_n,CAG
0,F1501,ECN101,3.0
1,F1502,BTM103,2.98
2,F1503,BTM104,3.3


In [82]:
merged_df2=pd.merge(class_df,crs_eval_df,how="left",on=["lec_id","course_n"])

In [83]:
merged_df2

Unnamed: 0,lec_id,Stu_reg_x,course_n,CAG,Stu_reg_y,CES
0,F1501,70,ECN101,3.0,70.0,4.0
1,F1502,55,BTM103,2.98,55.0,3.9
2,F1503,14,BTM104,3.3,14.0,4.3
3,F1504,18,BTM105,3.4,,
4,F1505,50,ECN201,2.7,,


In [84]:
merged_df3=pd.merge(class_df,crs_eval_df,how="right",on=["lec_id","course_n"])

In [85]:
merged_df3

Unnamed: 0,lec_id,Stu_reg_x,course_n,CAG,Stu_reg_y,CES
0,F1501,70.0,ECN101,3.0,70,4.0
1,F1502,55.0,BTM103,2.98,55,3.9
2,F1503,14.0,BTM104,3.3,14,4.3
3,F1506,,BTM104,,45,4.4
4,F1507,,ECN201,,65,4.7


In [87]:
merged_df4=pd.merge(class_df,crs_eval_df,how="outer",on=["lec_id","course_n"])

In [88]:
merged_df4

Unnamed: 0,lec_id,Stu_reg_x,course_n,CAG,Stu_reg_y,CES
0,F1501,70.0,ECN101,3.0,70.0,4.0
1,F1502,55.0,BTM103,2.98,55.0,3.9
2,F1503,14.0,BTM104,3.3,14.0,4.3
3,F1504,18.0,BTM105,3.4,,
4,F1505,50.0,ECN201,2.7,,
5,F1506,,BTM104,,45.0,4.4
6,F1507,,ECN201,,65.0,4.7


In [89]:
class_df1=class_df.set_index(["lec_id","course_n"])
class_df1

Unnamed: 0_level_0,Unnamed: 1_level_0,Stu_reg,CAG
lec_id,course_n,Unnamed: 2_level_1,Unnamed: 3_level_1
F1501,ECN101,70,3.0
F1502,BTM103,55,2.98
F1503,BTM104,14,3.3
F1504,BTM105,18,3.4
F1505,ECN201,50,2.7


In [96]:
crs_eval_df2=crs_eval_df.set_index(["lec_id","course_n"])[["CES"]]

In [97]:
crs_eval_df2

Unnamed: 0_level_0,Unnamed: 1_level_0,CES
lec_id,course_n,Unnamed: 2_level_1
F1501,ECN101,4.0
F1502,BTM103,3.9
F1503,BTM104,4.3
F1506,BTM104,4.4
F1507,ECN201,4.7


In [98]:
merged_df4=class_df1.join(crs_eval_df2,how="outer")
merged_df4

Unnamed: 0_level_0,Unnamed: 1_level_0,Stu_reg,CAG,CES
lec_id,course_n,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
F1501,ECN101,70.0,3.0,4.0
F1502,BTM103,55.0,2.98,3.9
F1503,BTM104,14.0,3.3,4.3
F1504,BTM105,18.0,3.4,
F1505,ECN201,50.0,2.7,
F1506,BTM104,,,4.4
F1507,ECN201,,,4.7


In [103]:
merged_df4[merged_df4["CES"].isin([3.9])]
merged_df4

Unnamed: 0_level_0,Unnamed: 1_level_0,Stu_reg,CAG,CES
lec_id,course_n,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
F1501,ECN101,70.0,3.0,4.0
F1502,BTM103,55.0,2.98,3.9
F1503,BTM104,14.0,3.3,4.3
F1504,BTM105,18.0,3.4,
F1505,ECN201,50.0,2.7,
F1506,BTM104,,,4.4
F1507,ECN201,,,4.7


In [104]:
merged_df4.dropna()

Unnamed: 0_level_0,Unnamed: 1_level_0,Stu_reg,CAG,CES
lec_id,course_n,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
F1501,ECN101,70.0,3.0,4.0
F1502,BTM103,55.0,2.98,3.9
F1503,BTM104,14.0,3.3,4.3


7. Descriptive statistics and one hot vector

In [105]:
iris_full_df.describe() #descriptive statistics with full numerical variables

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,target
count,150.0,150.0,150.0,150.0,150.0
mean,5.84,3.06,3.76,1.2,1.0
std,0.83,0.44,1.77,0.76,0.82
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [106]:
#descriptive statistics with a subset
iris_full_df.iloc[:,:4].describe()

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width
count,150.0,150.0,150.0,150.0
mean,5.84,3.06,3.76,1.2
std,0.83,0.44,1.77,0.76
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [107]:
iris_full_df.drop(["target"], axis=1).describe()

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width
count,150.0,150.0,150.0,150.0
mean,5.84,3.06,3.76,1.2
std,0.83,0.44,1.77,0.76
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [109]:
iris_full_df[["Sepal Length"]].describe()

Unnamed: 0,Sepal Length
count,150.0
mean,5.84
std,0.83
min,4.3
25%,5.1
50%,5.8
75%,6.4
max,7.9


In [110]:
iris_full_df.groupby(["label"]).describe() #by group

Unnamed: 0_level_0,Sepal Length,Sepal Length,Sepal Length,Sepal Length,Sepal Length,Sepal Length,Sepal Length,Sepal Length,Sepal Width,Sepal Width,...,Petal Width,Petal Width,target,target,target,target,target,target,target,target
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
setosa,50.0,5.01,0.35,4.3,4.8,5.0,5.2,5.8,50.0,3.43,...,0.3,0.6,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
versicolor,50.0,5.94,0.52,4.9,5.6,5.9,6.3,7.0,50.0,2.77,...,1.5,1.8,50.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
virginica,50.0,6.59,0.64,4.9,6.22,6.5,6.9,7.9,50.0,2.97,...,2.3,2.5,50.0,2.0,0.0,2.0,2.0,2.0,2.0,2.0


In [113]:
iris_full_df.iloc[:,:6].groupby(["label"]).describe().T

Unnamed: 0,label,setosa,versicolor,virginica
Sepal Length,count,50.0,50.0,50.0
Sepal Length,mean,5.01,5.94,6.59
Sepal Length,std,0.35,0.52,0.64
Sepal Length,min,4.3,4.9,4.9
Sepal Length,25%,4.8,5.6,6.22
Sepal Length,50%,5.0,5.9,6.5
Sepal Length,75%,5.2,6.3,6.9
Sepal Length,max,5.8,7.0,7.9
Sepal Width,count,50.0,50.0,50.0
Sepal Width,mean,3.43,2.77,2.97


In [114]:
iris_full_df.drop(["target"],axis=1).corr()

  iris_full_df.drop(["target"],axis=1).corr()


Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width
Sepal Length,1.0,-0.12,0.87,0.82
Sepal Width,-0.12,1.0,-0.43,-0.37
Petal Length,0.87,-0.43,1.0,0.96
Petal Width,0.82,-0.37,0.96,1.0
