In [2]:
import pandas as pd

# Read the five source tables from CSV files in the working directory.
students, teachers, grades1, grades2, contacts = (
    pd.read_csv(csv_name)
    for csv_name in (
        "students.csv",
        "teachers.csv",
        "grades1.csv",
        "grades2.csv",
        "contact.csv",
    )
)

# Preview the first two student rows.
students.head(n=2)

Unnamed: 0,id,firstname,lastname
0,34292,Joshua,Davis
1,34293,Karen,Flores


In [4]:
# Preview the first two rows of the course -> teacher table.
teachers.head(n=2)

Unnamed: 0,course,teacher
0,PHYS101,Wayne Walker
1,CHEM101,Albert Taylor


In [5]:
# Preview the first two rows of the first grades table.
grades1.head(n=2)

Unnamed: 0,student_id,course,grade
0,34292,ENGL101,A
1,34293,ENGL101,A


In [6]:
# Preview the first two rows of the second grades table.
grades2.head(n=2)

Unnamed: 0,student_id,course,grade
0,34302,DRAM101,F
1,34303,DRAM101,D


In [7]:
# Preview the first two rows of the parent-contact table.
contacts.head(n=2)

Unnamed: 0,student_id,parent_contact,phone
0,34292,Samuel Davis,(356) 849-0352
1,34293,Laura Flores,(477) 325-7117


## Concat and Append

In [13]:
'''
Tag every grade row with the semester it belongs to (1 for grades1,
2 for grades2), then stack the two tables on top of each other.
'''
grades1["Semester"], grades2["Semester"] = 1, 2
grades = pd.concat(objs=[grades1, grades2], axis=0)
grades

Unnamed: 0,student_id,course,grade,Semester
0,34292,ENGL101,A,1
1,34293,ENGL101,A,1
2,34294,ENGL101,C,1
3,34295,ENGL101,C,1
4,34296,ENGL101,C,1
...,...,...,...,...
60,34312,DESN101,E,2
61,34313,DESN101,D,2
62,34314,DESN101,A,2
63,34315,DESN101,A,2


In [9]:
'''
Alternative way to stack the two grade tables, which share the same columns.
DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
so pd.concat is used instead; it produces the same result, keeps the
original row labels, and leaves both input frames unmodified (so no
defensive copy is needed).
'''
appended = pd.concat([grades1, grades2])
appended

Unnamed: 0,student_id,course,grade,Semester
0,34292,ENGL101,A,1
1,34293,ENGL101,A,1
2,34294,ENGL101,C,1
3,34295,ENGL101,C,1
4,34296,ENGL101,C,1
...,...,...,...,...
60,34312,DESN101,E,2
61,34313,DESN101,D,2
62,34314,DESN101,A,2
63,34315,DESN101,A,2


In [14]:
'''
The concatenated frame carries repeated row labels, so rebuild the index.
drop=True discards the old labels instead of keeping them as a new column.
'''
grades = grades.reset_index(drop=True)
grades

Unnamed: 0,student_id,course,grade,Semester
0,34292,ENGL101,A,1
1,34293,ENGL101,A,1
2,34294,ENGL101,C,1
3,34295,ENGL101,C,1
4,34296,ENGL101,C,1
...,...,...,...,...
185,34312,DESN101,E,2
186,34313,DESN101,D,2
187,34314,DESN101,A,2
188,34315,DESN101,A,2


## Merging Horizontally

In [24]:
'''
Join each student to their grade rows.
The key is named "id" in students and "student_id" in grades, so the two
sides are given separately through left_on / right_on; when both frames use
the same column name, the single "on" parameter is enough.
'''
student_grades = students.merge(grades, left_on="id", right_on="student_id")
student_grades

Unnamed: 0,id,firstname,lastname,student_id,course,grade,Semester
0,34292,Joshua,Davis,34292,ENGL101,A,1
1,34292,Joshua,Davis,34292,MATH101,A,1
2,34292,Joshua,Davis,34292,PHYS101,D,1
3,34292,Joshua,Davis,34292,MATH102,F,1
4,34292,Joshua,Davis,34292,COMP101,A,1
...,...,...,...,...,...,...,...
185,34316,Stephen,Martinez,34316,ARTT101,A,1
186,34316,Stephen,Martinez,34316,DESN101,A,1
187,34316,Stephen,Martinez,34316,DRAM101,F,2
188,34316,Stephen,Martinez,34316,ARTT101,A,2


In [17]:
'''
Sanity check: print the shapes of both source frames next to the shape of
the merged frame so the row counts can be compared and any lost data spotted.
'''
print(*(frame.shape for frame in (students, grades, student_grades)))

(25, 3) (190, 4) (190, 7)


In [21]:
'''
Attach the parent contact details to every student.
The students table names its key "id", so it is renamed to "student_id"
first to match the key column of the contacts table.
'''
students2 = students.rename(columns={"id": "student_id"})
students_full = pd.merge(students2, contacts, on="student_id")
students_full

Unnamed: 0,student_id,firstname,lastname,parent_contact,phone
0,34292,Joshua,Davis,Samuel Davis,(356) 849-0352
1,34293,Karen,Flores,Laura Flores,(477) 325-7117
2,34294,Julia,Walker,Eric Walker,(871) 639-0797
3,34295,Cynthia,Robinson,Chris Robinson,(574) 683-2107
4,34296,Bonnie,Hall,Dorothy Hall,(384) 293-2113
5,34297,Gary Lee,Lee,Mary Lee,(914) 452-6366
6,34298,Ruby,Thompson,Jose Thompson,(253) 419-7702
7,34299,Heather,Miller,Annie Miller,(864) 583-9615
8,34300,Mary,Mitchell,Edward Mitchell,(494) 471-1604
9,34301,Carlos,Ross,Debra Ross,(592) 663-4425


In [25]:
'''
Combine the student/contact table with the per-student grade rows on the
shared "student_id" key, then print the shape of the result next to the
shapes of both inputs so the row counts can be compared.

NOTE: both inputs carry "firstname" and "lastname" columns, so the result
holds each of them twice, disambiguated by pandas' default _x / _y suffixes.
'''
students_grades_full = students_full.merge(student_grades, on="student_id")

print(students_grades_full.shape, student_grades.shape, students_full.shape)

(190, 11) (190, 7) (25, 5)


In [26]:
# Display the combined student / contact / grade table.
students_grades_full

Unnamed: 0,student_id,firstname_x,lastname_x,parent_contact,phone,id,firstname_y,lastname_y,course,grade,Semester
0,34292,Joshua,Davis,Samuel Davis,(356) 849-0352,34292,Joshua,Davis,ENGL101,A,1
1,34292,Joshua,Davis,Samuel Davis,(356) 849-0352,34292,Joshua,Davis,MATH101,A,1
2,34292,Joshua,Davis,Samuel Davis,(356) 849-0352,34292,Joshua,Davis,PHYS101,D,1
3,34292,Joshua,Davis,Samuel Davis,(356) 849-0352,34292,Joshua,Davis,MATH102,F,1
4,34292,Joshua,Davis,Samuel Davis,(356) 849-0352,34292,Joshua,Davis,COMP101,A,1
...,...,...,...,...,...,...,...,...,...,...,...
185,34316,Stephen,Martinez,Norma Martinez,(783) 439-7854,34316,Stephen,Martinez,ARTT101,A,1
186,34316,Stephen,Martinez,Norma Martinez,(783) 439-7854,34316,Stephen,Martinez,DESN101,A,1
187,34316,Stephen,Martinez,Norma Martinez,(783) 439-7854,34316,Stephen,Martinez,DRAM101,F,2
188,34316,Stephen,Martinez,Norma Martinez,(783) 439-7854,34316,Stephen,Martinez,ARTT101,A,2


In [27]:
# Add the teacher for each grade row by joining on the shared "course" column.
full = pd.merge(students_grades_full, teachers, on="course")

In [44]:
# Preview the first five rows of the fully merged table.
full.head(n=5)


Unnamed: 0,student_id,firstname_x,lastname_x,parent_contact,phone,id,firstname_y,lastname_y,course,grade,Semester,teacher
0,34292,Joshua,Davis,Samuel Davis,(356) 849-0352,34292,Joshua,Davis,ENGL101,A,1,Ernest Green
1,34293,Karen,Flores,Laura Flores,(477) 325-7117,34293,Karen,Flores,ENGL101,A,1,Ernest Green
2,34294,Julia,Walker,Eric Walker,(871) 639-0797,34294,Julia,Walker,ENGL101,C,1,Ernest Green
3,34295,Cynthia,Robinson,Chris Robinson,(574) 683-2107,34295,Cynthia,Robinson,ENGL101,C,1,Ernest Green
4,34296,Bonnie,Hall,Dorothy Hall,(384) 293-2113,34296,Bonnie,Hall,ENGL101,C,1,Ernest Green


In [58]:
'''
Count the number of "F" grades per student -- approach 1 of 2: value_counts.
Keep only the student_id of rows graded "F", tally how often each id occurs,
and turn the tally into a two-column frame (student_id, counts).
'''
df2 = full.loc[full["grade"].eq("F"), ["student_id"]]

method1 = (
    df2["student_id"]
    .value_counts()
    .rename_axis("student_id")
    .reset_index(name="counts")
)
method1

Unnamed: 0,student_id,counts
0,34300,4
1,34302,3
2,34303,2
3,34309,2
4,34311,2
5,34313,2
6,34316,2
7,34292,2
8,34294,2
9,34299,2


In [69]:
'''
Approach 2 of 2: the same per-student "F" tally, built with groupby.
Rows graded "F" are grouped by student_id and counted; the counted "grade"
column is then relabelled "counts".
'''
method2 = (
    full.loc[full["grade"].eq("F"), ["student_id", "grade"]]
    .groupby("student_id")
    .count()
    .reset_index()
    .rename(columns={"grade": "counts"})
)
method2.head(n=50)

Unnamed: 0,student_id,counts
0,34292,2
1,34293,1
2,34294,2
3,34297,1
4,34298,1
5,34299,2
6,34300,4
7,34301,2
8,34302,3
9,34303,2


In [75]:
'''
Pull out the student ids whose "F" count is 3 or more;
the resulting Series serves as a membership filter.
'''
sids = method1.loc[method1["counts"] >= 3, "student_id"]
sids.head(n=5)

0    34300
1    34302
Name: student_id, dtype: int64

In [77]:
'''
Last step: keep only the student/contact rows whose student_id is in sids.
'''

final = students_full.loc[students_full["student_id"].isin(sids)]
final

Unnamed: 0,student_id,firstname,lastname,parent_contact,phone
8,34300,Mary,Mitchell,Edward Mitchell,(494) 471-1604
10,34302,Dorothy,Green,Russell Green,(749) 344-5744
