In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the four source tables: the student list, the course list, and one
# registrations file per month (reg-month1 -> nov, reg-month2 -> dec).
students, courses = pd.read_csv('students.csv'), pd.read_csv('courses.csv')
nov, dec = pd.read_csv('reg-month1.csv'), pd.read_csv('reg-month2.csv')

In [None]:
# Preview the first three rows of the students table.
students.head(n=3)

Unnamed: 0,student_id,name,partner
0,1,Kailash Harjo,23
1,2,Esha Butala,1
2,3,Parveen Bhalla,3


In [None]:
# Preview the first three rows of the courses table.
courses.head(n=3)

Unnamed: 0,course_id,course_name,price
0,1,python,2499
1,2,sql,3499
2,3,data analysis,4999


In [None]:
# Preview the first three registrations of the first month (nov).
nov.head(n=3)

Unnamed: 0,student_id,course_id
0,23,1
1,15,5
2,18,6


In [None]:
# Preview the first three registrations of the second month (dec).
dec.head(n=3)

Unnamed: 0,student_id,course_id
0,3,5
1,16,7
2,12,10


# pd.concat

In [None]:
'''
pd.concat stacks the given dataframes vertically, one below the other. use it when rows that belong to the same table are stored in
separate dataframes. both dfs here have the same columns, which suggests that they are 2 parts of the same table, so we simply stack
them to form one common table. in other words, you may think of it as a UNION ALL of the 2 tables (duplicate rows are kept).

use case:-
nov and dec are 2 tables with the same type of content (same columns), suggesting that they are 2 parts of the same table, hence we
can combine them into a single table.
'''
pd.concat(objs=[nov, dec], axis=0)

Unnamed: 0,student_id,course_id
0,23,1
1,15,5
2,18,6
3,23,4
4,16,9
5,18,1
6,1,1
7,7,8
8,22,3
9,15,1


In [None]:
'''
a plain pd.concat([nov, dec]) only places one table below the other and leaves everything else untouched - including the row labels.
so once nov's labels (0-24) end, dec's labels start again from 0 (0-27). looking up label 0 therefore fetches 2 rows, one from each
month, and a row can no longer be identified uniquely by its label.
'''
pd.concat(objs=[nov, dec], axis=0).loc[0]

Unnamed: 0,student_id,course_id
0,23,1
0,3,5


In [None]:
'''
to identify each row uniquely after the concat, pass ignore_index=True: the original labels are discarded and the stacked rows are
renumbered from 0, so accessing a label now returns exactly 1 row instead of the 2 rows that shared the same label previously.
'''
regs = pd.concat(objs=[nov, dec], axis=0, ignore_index=True)
regs

Unnamed: 0,student_id,course_id
0,23,1
1,15,5
2,18,6
3,23,4
4,16,9
5,18,1
6,1,1
7,7,8
8,22,3
9,15,1


# multi-index df (intro)

In [None]:
'''
above, we are able to uniquely identify the rows, but what if we also want to know whether a row is coming from nov or dec?
passing keys= adds an outer index level that holds the label of the source table, so every row gets a 2-level (multi) index:
(month label, original row label).
'''
df = pd.concat([nov, dec], keys=['Nov', 'Dec'])
# get the row labelled 5 of the nov table (5 is a row label here, not a student_id).
# a multi-index row key is written as a tuple, and the explicit ':' column slice keeps
# pandas from reading the second element as a column label.
df.loc[('Nov', 5), :]  # loc[(outer_label, inner_label), columns]

Unnamed: 0_level_0,Nov
Unnamed: 0_level_1,5
student_id,18
course_id,1


# joins
1. inner
2. left
3. right
4. full (called `outer` in pandas, i.e. `how='outer'`)

syntax:

`df.merge(another_df, how='join_type', on='col')`

In [None]:
# inner join: keep only the student_ids that appear in BOTH students and regs
students.merge(right=regs, on='student_id', how='inner')

Unnamed: 0,student_id,name,partner,course_id
0,1,Kailash Harjo,23,1
1,1,Kailash Harjo,23,6
2,1,Kailash Harjo,23,10
3,1,Kailash Harjo,23,9
4,2,Esha Butala,1,5
5,3,Parveen Bhalla,3,3
6,3,Parveen Bhalla,3,5
7,7,Tarun Thaker,9,8
8,7,Tarun Thaker,9,10
9,7,Tarun Thaker,9,7


In [None]:
# left join: keep every row of courses (the left table); a course without any
# matching registration gets NaN in the columns coming from regs
courses.merge(right=regs, on='course_id', how='left')

Unnamed: 0,course_id,course_name,price,student_id
0,1,python,2499,23.0
1,1,python,2499,18.0
2,1,python,2499,1.0
3,1,python,2499,15.0
4,1,python,2499,21.0
5,1,python,2499,25.0
6,1,python,2499,12.0
7,1,python,2499,14.0
8,1,python,2499,38.0
9,2,sql,3499,19.0


In [None]:
# right join: keep every row of regs (the right table); a registration whose
# student_id is missing from students gets NaN in the columns coming from students
students.merge(right=regs, on='student_id', how='right')

Unnamed: 0,student_id,name,partner,course_id
0,23,Chhavi Lachman,18.0,1
1,15,Preet Sha,16.0,5
2,18,Fardeen Mahabir,13.0,6
3,23,Chhavi Lachman,18.0,4
4,16,Elias Dodiya,25.0,9
5,18,Fardeen Mahabir,13.0,1
6,1,Kailash Harjo,23.0,1
7,7,Tarun Thaker,9.0,8
8,22,Yash Sethi,21.0,3
9,15,Preet Sha,16.0,1


In [None]:
# outer (full) join: keep the rows of both tables; whatever has no match on the
# other side is filled with NaN. only the last 10 rows are shown.
students.merge(right=regs, on='student_id', how='outer').tail(n=10)

Unnamed: 0,student_id,name,partner,course_id
50,23,Chhavi Lachman,18.0,3.0
51,23,Chhavi Lachman,18.0,6.0
52,23,Chhavi Lachman,18.0,9.0
53,23,Chhavi Lachman,18.0,5.0
54,24,Radhika Suri,17.0,4.0
55,25,Shashank D’Alia,2.0,1.0
56,25,Shashank D’Alia,2.0,10.0
57,38,,,1.0
58,42,,,9.0
59,50,,,8.0


# practice questions

In [None]:
# 1. find total revenue generated
# start from regs (one row per registration) so that every sale is counted exactly once.
# a left join that starts from courses also keeps the courses nobody registered for, and
# each of those rows would wrongly add its price to the total.
df1 = regs.merge(courses, how='inner', on='course_id')
df1['price'].sum()

np.int64(156245)

In [None]:
# 2. find month by month revenue
# keys= tags every row with its month; reset_index() turns the two unnamed index levels
# into the columns level_0 (month label) and level_1 (original row label).
# the result is kept under its own name so that the shared `regs` table is not overwritten.
regs_by_month = pd.concat([nov, dec], keys=['nov', 'dec']).reset_index()
# select price before aggregating so that only the needed column is summed
regs_by_month.merge(courses, on='course_id').groupby('level_0')['price'].sum()

Unnamed: 0_level_0,price
level_0,Unnamed: 1_level_1
dec,65072
nov,89175


In [None]:
# 3. find students who enrolled in both the months
'''
after a groupby, when do we get a df and when a series?
NOTE:- the column/s passed to groupby() become the index/indices of the result, which is a df or a series depending on the following:-
1. when df?
when we do:-
mask = df.groupby(['student_id', 'level_0']).count()
this is grouping + aggregation "WITHOUT" selecting a column.
student_id and level_0 become the indices of the df.

2. when series?
when we do:-
mask = df.groupby(['student_id', 'level_0'])['level_0'].count()
this is grouping + aggregation "WITH" a single column selected, so we get a series.
student_id and level_0 become the indices of the series.

in both cases the columns inside the groupby become the index/indices, and the values returned by the aggregation method become the
value column/s of the series/df.
when you reset the index, student_id and level_0 turn from indices back into columns, and the value column of the series can be
given a name, e.g. cnt via reset_index(name='cnt').
'''

'''
set operations:-
np.union1d()        ------> all the unique items found in either array
np.intersect1d()    ------> the items common to both arrays
np.setdiff1d()      ------> the items that are in the first (left) array but not in the second (right) array
'''

# student_ids that occur in the registrations of both months
common_student_list = np.intersect1d(nov['student_id'], dec['student_id'])
df_new = students.loc[students['student_id'].isin(common_student_list)]
df_new

Unnamed: 0,student_id,name,partner
0,1,Kailash Harjo,23
2,3,Parveen Bhalla,3
6,7,Tarun Thaker,9
10,11,David Mukhopadhyay,20
15,16,Elias Dodiya,25
16,17,Yasmin Palan,7
17,18,Fardeen Mahabir,13
21,22,Yash Sethi,21
22,23,Chhavi Lachman,18


In [None]:
# 4. find course that got no enrollment
'''
regs holds one row per registration (the student_id and the course_id of the course registered for).
courses holds every course and its id.
a set difference between the course_ids of courses and the course_ids of regs drops every id that occurs in both and keeps only
the ids of the courses that nobody registered for.
'''
# ignore_index=True renumbers the stacked rows so that regs has no duplicate row labels
regs = pd.concat([nov, dec], ignore_index=True)
courses_not_reg_list = np.setdiff1d(courses['course_id'], regs['course_id'])
courses[courses['course_id'].isin(courses_not_reg_list)]

Unnamed: 0,course_id,course_name,price
10,11,Numpy,699
11,12,C++,1299


In [None]:
# 5. find students who did not enroll into any courses
# student_ids present in students but absent from regs.
# (their share in percent is: number of rows of the result * 100 / students.shape[0])
student_not_reg_list = np.setdiff1d(students['student_id'], regs['student_id'])
students.loc[students['student_id'].isin(student_not_reg_list)]

28.0

In [None]:
# 6. Print student name -> partner name for all enrolled students
'''
when the columns to join on have different names in the 2 tables, use left_on and right_on instead of on.
'''
# keep only the students that have at least one registration
enrolled_students = students[students['student_id'].isin(regs['student_id'])]
# lookup table used to turn each partner id into the partner's name
partner_names = students[['student_id', 'name']].rename(
    columns={'student_id': 'partner_id', 'name': 'partner_name'}
)
# match each enrolled student's `partner` value against the lookup's `partner_id`,
# then keep just the two name columns (first 3 rows shown)
enrolled_students.merge(
    partner_names, how='inner', left_on='partner', right_on='partner_id'
)[['name', 'partner_name']].head(3)

Unnamed: 0,student_id_x,name_x,partner_x,student_id_y,name_y,partner_y
0,1,Kailash Harjo,23,23,Chhavi Lachman,18
1,2,Esha Butala,1,1,Kailash Harjo,23
2,3,Parveen Bhalla,3,3,Parveen Bhalla,3


In [None]:
# 7. find top 3 students who did most number enrollments
# count the registrations per student, rank by that count (highest first) and only then
# keep the top 3; without the sort, head(3) just returns the 3 smallest student_ids.
mask = (
    regs.groupby('student_id')['course_id']
    .count()
    .sort_values(ascending=False)
    .head(3)
    .reset_index()
)
# the left join keeps the ranking order of mask while looking up the names
mask.merge(students, how='left', on='student_id')['name']

Unnamed: 0,name
0,Kailash Harjo
1,Esha Butala
2,Parveen Bhalla


In [None]:
# 8. find top 3 student names who spent most amount of money on courses
# attach a price to every registration, total the spend per student, rank the totals
# from highest to lowest and keep the first 3
mask = (
    regs
    .merge(courses, how='left', on='course_id')
    .groupby('student_id')['price']
    .sum()
    .sort_values(ascending=False)
    .head(3)
    .reset_index()
)
# look up the names of those 3 student_ids, keeping the ranking order
mask.merge(students, how='left', on='student_id')['name']

Unnamed: 0,name
0,Chhavi Lachman
1,Pranab Natarajan
2,Qabeel Raman
