# ⌨️ (1:41:41) Introduction to Pandas and Numpy

# 1 Creating a dataframe from an array

In [1]:
import pandas as pd
import numpy as np

## 1.1 Option 1

In [2]:
# creating an array
data = np.array([[1, 4], [2, 6], [-9, 7]])

In [3]:
data

array([[ 1,  4],
       [ 2,  6],
       [-9,  7]])

In [4]:
# creating a dataframe
df = pd.DataFrame(data, index=['row_one', 'row_two', 'row_three'], columns=['col_one', 'col_two'])

In [5]:
df

Unnamed: 0,col_one,col_two
row_one,1,4
row_two,2,6
row_three,-9,7


## 2.2 Option 2

In [6]:
data = [['Alice', 25, 'New York'],
            ['Bob', 30, 'London'],
            ['Charlie', 35, 'Paris']]


In [7]:
df = pd.DataFrame(data, columns=['Name', 'Age', 'City'])

In [8]:
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,London
2,Charlie,35,Paris


# 2 Creating a dataframe from a Dictionary

In [9]:
states = ["California", "Texas", "New York", "Florida", "Illinois"]
population = [39_431_263, 31_290_831, 19_867_248, 23_372_215, 12_710_158]

In [10]:
dict_states = {'States': states, 'Population': population}

In [11]:
# creating a dataframe
df_dict_states = pd.DataFrame(dict_states)

In [12]:
df_dict_states

Unnamed: 0,States,Population
0,California,39431263
1,Texas,31290831
2,New York,19867248
3,Florida,23372215
4,Illinois,12710158


# 3 Creating a dataframe from a csv file

In [108]:
# reading the csv file
df_exams = pd.read_csv('StudentsPerformance.csv')

In [109]:
df_exams

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
5,female,group B,associate's degree,standard,none,71,83,78
6,female,group B,some college,standard,completed,88,95,92
7,male,group B,some college,free/reduced,none,40,43,39
8,male,group D,high school,free/reduced,completed,64,64,67
9,female,group B,high school,free/reduced,none,38,60,50


In [15]:
# showing 5 first rows
df_exams.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [16]:
# showing 5 last rows
df_exams.tail()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77
999,female,group D,some college,free/reduced,none,77,86,86


In [17]:
# showing n rows in dataframe
df_exams.head(10)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
5,female,group B,associate's degree,standard,none,71,83,78
6,female,group B,some college,standard,completed,88,95,92
7,male,group B,some college,free/reduced,none,40,43,39
8,male,group D,high school,free/reduced,completed,64,64,67
9,female,group B,high school,free/reduced,none,38,60,50


In [18]:
df_exams.tail(10)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
990,male,group E,high school,free/reduced,completed,86,81,75
991,female,group B,some high school,standard,completed,65,82,78
992,female,group D,associate's degree,free/reduced,none,55,76,76
993,female,group D,bachelor's degree,free/reduced,none,62,72,74
994,male,group A,high school,standard,none,63,63,62
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77
999,female,group D,some college,free/reduced,none,77,86,86


In [21]:
# display all rows
pd.set_option('display.max_rows', 1000)
df_exams

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
5,female,group B,associate's degree,standard,none,71,83,78
6,female,group B,some college,standard,completed,88,95,92
7,male,group B,some college,free/reduced,none,40,43,39
8,male,group D,high school,free/reduced,completed,64,64,67
9,female,group B,high school,free/reduced,none,38,60,50


In [20]:
# 2:16:00

# 4 Basic Attributes, Methods and Functions

## 4.1 Attributes

In [23]:
df_exams.shape

(1000, 8)

In [24]:
df_exams.index

RangeIndex(start=0, stop=1000, step=1)

In [25]:
df_exams.columns

Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score'],
      dtype='object')

In [26]:
df_exams.dtypes

gender                         object
race/ethnicity                 object
parental level of education    object
lunch                          object
test preparation course        object
math score                      int64
reading score                   int64
writing score                   int64
dtype: object

## 4.2 Methods

In [105]:
df_exams.head(10)

Unnamed: 0,gender,test of insert,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,language score,score_one,score_two
0,female,9,group B,bachelor's degree,standard,none,72,72,74,76,9,29
1,female,24,group C,some college,standard,completed,69,90,88,17,24,23
2,female,99,group B,master's degree,standard,none,90,95,93,16,99,20
3,male,24,group A,associate's degree,free/reduced,none,47,57,44,23,24,36
4,male,11,group C,some college,standard,none,76,78,75,93,11,60


In [31]:
df_exams.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [32]:
# describing basic statistics
df_exams.describe()

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


## 4.3 Functions

In [33]:
# get the highest index in dataframe
max(df_exams)

'writing score'

In [36]:
max(df_exams.index)

999

In [39]:
# get the lowest index in dataframe
print(min(df_exams))
print(min(df_exams.index))

gender
0


# 5. Selecting One Column from a DataFrame

In [40]:
df_exams.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


## 5.1 Syntax 1

In [41]:
df_exams['gender']

0      female
1      female
2      female
3        male
4        male
5      female
6      female
7        male
8        male
9      female
10       male
11       male
12     female
13       male
14     female
15     female
16       male
17     female
18       male
19     female
20       male
21     female
22       male
23     female
24       male
25       male
26       male
27     female
28       male
29     female
30     female
31     female
32     female
33       male
34       male
35       male
36     female
37     female
38     female
39       male
40       male
41     female
42     female
43       male
44     female
45       male
46     female
47     female
48     female
49       male
50       male
51       male
52       male
53       male
54     female
55     female
56     female
57       male
58       male
59     female
60       male
61       male
62       male
63     female
64     female
65       male
66       male
67     female
68       male
69     female
70     female
71    

In [42]:
type(df_exams['gender'])

pandas.core.series.Series

In [43]:
df_exams['gender'].index

RangeIndex(start=0, stop=1000, step=1)

In [106]:
df_exams['gender'].head(100)

0    female
1    female
2    female
3      male
4      male
Name: gender, dtype: object

In [46]:
# select 2 columns 
df_exams[['gender', 'math score']].head()

Unnamed: 0,gender,math score
0,female,72
1,female,69
2,female,90
3,male,47
4,male,76


In [47]:
df_exams[['gender', 'math score', 'reading score','writing score']].head()

Unnamed: 0,gender,math score,reading score,writing score
0,female,72,72,74
1,female,69,90,88
2,female,90,95,93
3,male,47,57,44
4,male,76,78,75


In [48]:
df_exams[['writing score', 'math score', 'reading score','gender']].head()

Unnamed: 0,writing score,math score,reading score,gender
0,74,72,72,female
1,88,69,90,female
2,93,90,95,female
3,44,47,57,male
4,75,76,78,male


# 6. Add new column to a DataFrame

In [107]:
df_exams['language score'] = 70
df_exams

Unnamed: 0,gender,test of insert,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,language score,score_one,score_two
0,female,9,group B,bachelor's degree,standard,none,72,72,74,70,9,29
1,female,24,group C,some college,standard,completed,69,90,88,70,24,23
2,female,99,group B,master's degree,standard,none,90,95,93,70,99,20
3,male,24,group A,associate's degree,free/reduced,none,47,57,44,70,24,36
4,male,11,group C,some college,standard,none,76,78,75,70,11,60


## 6.1 add new column with an array

In [52]:
# create array with 1000 items
language_score = np.arange(0, 1000)

In [53]:
len(language_score)

1000

In [54]:
# adding new column to a dataframe
df_exams['language score'] = language_score

In [55]:
df_exams

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,language score
0,female,group B,bachelor's degree,standard,none,72,72,74,0
1,female,group C,some college,standard,completed,69,90,88,1
2,female,group B,master's degree,standard,none,90,95,93,2
3,male,group A,associate's degree,free/reduced,none,47,57,44,3
4,male,group C,some college,standard,none,76,78,75,4
5,female,group B,associate's degree,standard,none,71,83,78,5
6,female,group B,some college,standard,completed,88,95,92,6
7,male,group B,some college,free/reduced,none,40,43,39,7
8,male,group D,high school,free/reduced,completed,64,64,67,8
9,female,group B,high school,free/reduced,none,38,60,50,9


In [58]:
# create random integers numbers between 1 and 1000
np.random.randint(1, 100, size=1000)

array([ 1, 40, 77, 58,  5, 51, 31, 64, 94, 39, 49, 62, 10, 94, 80, 65, 34,
       58, 14, 49, 25,  3, 74, 16, 30, 46, 57, 43, 48, 28, 17, 57, 77, 23,
       11, 41, 48, 30, 55, 32, 54, 25,  5, 39, 72, 75, 75, 34, 43, 91, 75,
       44, 18, 31, 90, 21, 21, 74, 67, 68, 73, 80, 40, 25, 29, 68, 19, 38,
       90, 30,  4, 36, 89, 71, 68, 49,  2, 55, 17, 33, 72, 98, 73, 85, 65,
       16, 49, 49, 47, 92, 23, 41, 30, 21,  8, 55,  8, 72, 25, 72, 82, 15,
       29, 71, 96, 61, 50, 62, 41, 10,  7,  3, 88, 98, 77, 19, 83, 45, 25,
       35, 98, 61,  7, 66, 50, 92, 50,  7, 84, 54, 75, 30, 59, 58, 62, 71,
       56, 98, 64, 88, 79, 32, 89, 10, 79, 54, 81, 11, 96, 22, 65, 69, 60,
       44, 52, 20, 34, 77, 70,  4, 17, 57, 21, 36, 88,  4, 11, 10, 25,  3,
       51, 70, 78, 53,  9, 64, 73, 29, 10, 12, 90, 97, 23,  4, 11, 78, 56,
        9, 77, 86, 43,  9, 52, 30, 21, 19, 43, 78, 20, 30, 92, 46, 79, 54,
       67, 39, 59, 76, 20, 97,  3, 26, 73, 20, 70, 46, 67, 11, 39, 38, 69,
       96, 92, 74, 61, 27

In [59]:
df_exams['language score'] = np.random.randint(1, 100, size=1000)

In [60]:
df_exams

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,language score
0,female,group B,bachelor's degree,standard,none,72,72,74,76
1,female,group C,some college,standard,completed,69,90,88,17
2,female,group B,master's degree,standard,none,90,95,93,16
3,male,group A,associate's degree,free/reduced,none,47,57,44,23
4,male,group C,some college,standard,none,76,78,75,93
5,female,group B,associate's degree,standard,none,71,83,78,49
6,female,group B,some college,standard,completed,88,95,92,62
7,male,group B,some college,free/reduced,none,40,43,39,82
8,male,group D,high school,free/reduced,completed,64,64,67,71
9,female,group B,high school,free/reduced,none,38,60,50,70


In [62]:
# float numbers
np.random.uniform(1, 100, size=100)

array([64.95437125, 31.26519182, 14.94913473, 10.89450236, 42.68181482,
       86.80614527, 57.36235289, 35.59078214, 33.83933255, 38.99741201,
       64.42459285, 57.72632645, 62.69736918, 95.3563712 , 34.03449171,
       50.47752084, 21.91831985, 73.96423581, 46.86291957, 74.46374157,
       85.6243568 , 17.66610597, 89.85141131, 43.24657413, 97.23662084,
       95.94922902, 38.40833068, 50.07175399, 51.47887733, 11.06261718,
       65.15252131, 31.83072044,  8.97458207,  8.90554444, 67.61286525,
       59.26394683, 27.60150539, 13.50551337, 64.31863327, 45.72800628,
       64.87444673, 18.35197184, 59.73536032, 41.36180868, 71.7130043 ,
       31.75010185, 47.05897485, 23.53024264, 45.86208247, 67.62861845,
       38.18908136, 38.95842427, 80.39866271, 32.82433209, 98.71197617,
       32.77137501, 82.44304086, 26.02508076, 29.86495354, 86.89012463,
        6.8298063 , 12.1901744 , 33.9100402 , 10.26436671, 88.01405966,
       34.3140272 , 11.37056252, 64.49727545, 61.63695687, 87.69

## 6.1 Assign()

In [84]:
score_one = np.random.randint(1, 100, size=1000)
score_two = np.random.randint(1, 100, size=1000)

In [85]:
series_one = pd.Series(score_one, index=np.arange(0, 1000))
series_two = pd.Series(score_two, index=np.arange(0, 1000))

In [99]:
df_exams = df_exams.assign(score_one=series_one, score_two=series_two)

In [101]:
df_exams

Unnamed: 0,gender,test of insert,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,language score,score_one,score_two
0,female,9,group B,bachelor's degree,standard,none,72,72,74,76,9,29
1,female,24,group C,some college,standard,completed,69,90,88,17,24,23
2,female,99,group B,master's degree,standard,none,90,95,93,16,99,20
3,male,24,group A,associate's degree,free/reduced,none,47,57,44,23,24,36
4,male,11,group C,some college,standard,none,76,78,75,93,11,60


## 6.2 insert()

In [88]:
# using index() to add a column at specific position
df_exams.insert(1, 'test of insert', series_one)

In [102]:
df_exams

Unnamed: 0,gender,test of insert,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,language score,score_one,score_two
0,female,9,group B,bachelor's degree,standard,none,72,72,74,76,9,29
1,female,24,group C,some college,standard,completed,69,90,88,17,24,23
2,female,99,group B,master's degree,standard,none,90,95,93,16,99,20
3,male,24,group A,associate's degree,free/reduced,none,47,57,44,23,24,36
4,male,11,group C,some college,standard,none,76,78,75,93,11,60


# 7. Math Operations

## 7.1 Operations in columns

In [111]:
df_exams['math score'].sum()

np.int64(66089)

In [112]:
df_exams.describe()

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [114]:
df_exams['math score'].mean()

np.float64(66.089)

## 7.2 Operations in rows

In [115]:
# calculating the sum in rows
df_exams['math score'] + df_exams['reading score'] + df_exams['writing score']

0      218
1      247
2      278
3      148
4      229
5      232
6      275
7      122
8      195
9      148
10     164
11     135
12     219
13     220
14     161
15     222
16     263
17      78
18     134
19     173
20     198
21     210
22     151
23     215
24     225
25     219
26     178
27     211
28     205
29     207
30     217
31     189
32     193
33     120
34     266
35     241
36     238
37     173
38     253
39     170
40     170
41     199
42     176
43     190
44     160
45     176
46     182
47     213
48     207
49     248
50     156
51     214
52     139
53     241
54     242
55     117
56     253
57     156
58     175
59      27
60     225
61     112
62     178
63     220
64     176
65     192
66     119
67     206
68     175
69     160
70     194
71     181
72     140
73     174
74     139
75     123
76      78
77     239
78     207
79     198
80     146
81     139
82     151
83     199
84     115
85     235
86     247
87     216
88     195
89     241
90     211

In [118]:
df_exams['average'] = (df_exams['math score'] + df_exams['reading score'] + df_exams['writing score'])/3

In [121]:
df_exams.round(5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,average
0,female,group B,bachelor's degree,standard,none,72,72,74,72.66667
1,female,group C,some college,standard,completed,69,90,88,82.33333
2,female,group B,master's degree,standard,none,90,95,93,92.66667
3,male,group A,associate's degree,free/reduced,none,47,57,44,49.33333
4,male,group C,some college,standard,none,76,78,75,76.33333
5,female,group B,associate's degree,standard,none,71,83,78,77.33333
6,female,group B,some college,standard,completed,88,95,92,91.66667
7,male,group B,some college,free/reduced,none,40,43,39,40.66667
8,male,group D,high school,free/reduced,completed,64,64,67,65.0
9,female,group B,high school,free/reduced,none,38,60,50,49.33333


# 8. The value_counts()

In [122]:
df_exams['gender'].value_counts()

gender
female    518
male      482
Name: count, dtype: int64

In [123]:
df_exams['gender'].value_counts(normalize=True)

gender
female    0.518
male      0.482
Name: proportion, dtype: float64

In [124]:
# parental level of education
df_exams['parental level of education'].value_counts()

parental level of education
some college          226
associate's degree    222
high school           196
some high school      179
bachelor's degree     118
master's degree        59
Name: count, dtype: int64

In [125]:
df_exams['parental level of education'].value_counts(normalize=True)

parental level of education
some college          0.226
associate's degree    0.222
high school           0.196
some high school      0.179
bachelor's degree     0.118
master's degree       0.059
Name: proportion, dtype: float64

# 9. Sort the DataFrame

In [127]:
# sort by one column
df_exams.sort_values(by = 'math score')

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,average
59,female,group C,some high school,free/reduced,none,0,17,10,9.0
980,female,group B,high school,free/reduced,none,8,24,23,18.333333
17,female,group B,some high school,free/reduced,none,18,32,28,26.0
787,female,group B,some college,standard,none,19,38,32,29.666667
145,female,group C,some college,free/reduced,none,22,39,33,31.333333
842,female,group B,high school,free/reduced,completed,23,44,36,34.333333
338,female,group B,some high school,free/reduced,none,24,38,27,29.666667
466,female,group D,associate's degree,free/reduced,none,26,31,38,31.666667
91,male,group C,high school,free/reduced,none,27,34,36,32.333333
363,female,group D,some high school,free/reduced,none,27,34,32,31.0


In [128]:
# sort DESC by one column
df_exams.sort_values(by='math score', ascending=False)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,average
962,female,group E,associate's degree,standard,none,100,100,100,100.0
625,male,group D,some college,standard,completed,100,97,99,98.666667
458,female,group E,bachelor's degree,standard,none,100,100,100,100.0
623,male,group A,some college,standard,completed,100,96,86,94.0
451,female,group E,some college,standard,none,100,92,97,96.333333
149,male,group E,associate's degree,free/reduced,completed,100,100,93,97.666667
916,male,group E,bachelor's degree,standard,completed,100,100,100,100.0
263,female,group E,high school,standard,none,99,93,90,94.0
306,male,group E,some college,standard,completed,99,87,81,89.0
114,female,group E,bachelor's degree,standard,completed,99,100,100,99.666667


In [129]:
# sort DESC by multiple column
df_exams.sort_values(['math score', 'reading score'], ascending=False)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,average
149,male,group E,associate's degree,free/reduced,completed,100,100,93,97.666667
458,female,group E,bachelor's degree,standard,none,100,100,100,100.0
916,male,group E,bachelor's degree,standard,completed,100,100,100,100.0
962,female,group E,associate's degree,standard,none,100,100,100,100.0
625,male,group D,some college,standard,completed,100,97,99,98.666667
623,male,group A,some college,standard,completed,100,96,86,94.0
451,female,group E,some college,standard,none,100,92,97,96.333333
114,female,group E,bachelor's degree,standard,completed,99,100,100,99.666667
263,female,group E,high school,standard,none,99,93,90,94.0
306,male,group E,some college,standard,completed,99,87,81,89.0


In [131]:
df_exams.sort_values('race/ethnicity')

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,average
778,female,group A,some college,standard,completed,72,79,82,77.666667
546,female,group A,some high school,standard,completed,92,100,97,96.333333
820,female,group A,some high school,standard,completed,85,90,92,89.0
741,female,group A,associate's degree,free/reduced,none,37,57,56,50.0
305,male,group A,some college,standard,none,69,67,69,68.333333
62,male,group A,associate's degree,free/reduced,none,62,61,55,59.333333
61,male,group A,some high school,free/reduced,none,39,39,34,37.333333
972,female,group A,high school,free/reduced,completed,53,50,60,54.333333
112,male,group A,associate's degree,standard,none,54,53,47,51.333333
300,male,group A,some college,free/reduced,completed,81,78,81,80.0


# 10. The set_index() and sort_index()

## 10.1 Create index

In [132]:
import random

In [133]:
new_index = np.arange(0, 1000)

In [134]:
random.shuffle(new_index)

In [135]:
df_exams['new index'] = new_index

In [136]:
df_exams

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,average,new index
0,female,group B,bachelor's degree,standard,none,72,72,74,72.666667,208
1,female,group C,some college,standard,completed,69,90,88,82.333333,468
2,female,group B,master's degree,standard,none,90,95,93,92.666667,247
3,male,group A,associate's degree,free/reduced,none,47,57,44,49.333333,940
4,male,group C,some college,standard,none,76,78,75,76.333333,53
5,female,group B,associate's degree,standard,none,71,83,78,77.333333,306
6,female,group B,some college,standard,completed,88,95,92,91.666667,174
7,male,group B,some college,free/reduced,none,40,43,39,40.666667,173
8,male,group D,high school,free/reduced,completed,64,64,67,65.0,197
9,female,group B,high school,free/reduced,none,38,60,50,49.333333,958


In [137]:
len(new_index)

1000

## 10.2 set the index

In [138]:
df_exams.set_index('new index')

Unnamed: 0_level_0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,average
new index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
208,female,group B,bachelor's degree,standard,none,72,72,74,72.666667
468,female,group C,some college,standard,completed,69,90,88,82.333333
247,female,group B,master's degree,standard,none,90,95,93,92.666667
940,male,group A,associate's degree,free/reduced,none,47,57,44,49.333333
53,male,group C,some college,standard,none,76,78,75,76.333333
306,female,group B,associate's degree,standard,none,71,83,78,77.333333
174,female,group B,some college,standard,completed,88,95,92,91.666667
173,male,group B,some college,free/reduced,none,40,43,39,40.666667
197,male,group D,high school,free/reduced,completed,64,64,67,65.0
958,female,group B,high school,free/reduced,none,38,60,50,49.333333


In [141]:
# sort by new index
df_exams.sort_index()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,average,new index
0,female,group B,bachelor's degree,standard,none,72,72,74,72.666667,208
1,female,group C,some college,standard,completed,69,90,88,82.333333,468
2,female,group B,master's degree,standard,none,90,95,93,92.666667,247
3,male,group A,associate's degree,free/reduced,none,47,57,44,49.333333,940
4,male,group C,some college,standard,none,76,78,75,76.333333,53
5,female,group B,associate's degree,standard,none,71,83,78,77.333333,306
6,female,group B,some college,standard,completed,88,95,92,91.666667,174
7,male,group B,some college,free/reduced,none,40,43,39,40.666667,173
8,male,group D,high school,free/reduced,completed,64,64,67,65.0,197
9,female,group B,high school,free/reduced,none,38,60,50,49.333333,958


In [142]:
df_exams

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,average,new index
0,female,group B,bachelor's degree,standard,none,72,72,74,72.666667,208
1,female,group C,some college,standard,completed,69,90,88,82.333333,468
2,female,group B,master's degree,standard,none,90,95,93,92.666667,247
3,male,group A,associate's degree,free/reduced,none,47,57,44,49.333333,940
4,male,group C,some college,standard,none,76,78,75,76.333333,53
5,female,group B,associate's degree,standard,none,71,83,78,77.333333,306
6,female,group B,some college,standard,completed,88,95,92,91.666667,174
7,male,group B,some college,free/reduced,none,40,43,39,40.666667,173
8,male,group D,high school,free/reduced,completed,64,64,67,65.0,197
9,female,group B,high school,free/reduced,none,38,60,50,49.333333,958


# 11. Rename indexes and Columns with rename

In [144]:
df_exams.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,average,new index
0,female,group B,bachelor's degree,standard,none,72,72,74,72.666667,208
1,female,group C,some college,standard,completed,69,90,88,82.333333,468
2,female,group B,master's degree,standard,none,90,95,93,92.666667,247
3,male,group A,associate's degree,free/reduced,none,47,57,44,49.333333,940
4,male,group C,some college,standard,none,76,78,75,76.333333,53


In [146]:
df_exams.rename(columns={'gender': 'Gender'}).head()

Unnamed: 0,Gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,average,new index
0,female,group B,bachelor's degree,standard,none,72,72,74,72.666667,208
1,female,group C,some college,standard,completed,69,90,88,82.333333,468
2,female,group B,master's degree,standard,none,90,95,93,92.666667,247
3,male,group A,associate's degree,free/reduced,none,47,57,44,49.333333,940
4,male,group C,some college,standard,none,76,78,75,76.333333,53


In [154]:
df_exams.rename(columns={'math score': 'MS', 'reading score': 'RS', 'writing score': 'WS'}, inplace=True)
df_exams.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,MS,RS,WS,average,new index
0,female,group B,bachelor's degree,standard,none,72,72,74,72.666667,208
1,female,group C,some college,standard,completed,69,90,88,82.333333,468
2,female,group B,master's degree,standard,none,90,95,93,92.666667,247
3,male,group A,associate's degree,free/reduced,none,47,57,44,49.333333,940
4,male,group C,some college,standard,none,76,78,75,76.333333,53


In [155]:
df_exams.rename(columns={'parental level of education': 'PLE'}).head()

Unnamed: 0,gender,race/ethnicity,PLE,lunch,test preparation course,MS,RS,WS,average,new index
0,female,group B,bachelor's degree,standard,none,72,72,74,72.666667,208
1,female,group C,some college,standard,completed,69,90,88,82.333333,468
2,female,group B,master's degree,standard,none,90,95,93,92.666667,247
3,male,group A,associate's degree,free/reduced,none,47,57,44,49.333333,940
4,male,group C,some college,standard,none,76,78,75,76.333333,53
