In [1]:
# dependencies
import pandas as pd
import numpy as np
import os

In [2]:
#load school csv
cityschools = os.path.join("schools_complete.csv")

In [3]:
#read cityschools with pandas
cityschools_df = pd.read_csv(cityschools)
cityschools_df.head()

Unnamed: 0,School ID,name,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500


In [4]:
total_budget = cityschools_df['budget'].sum()
print(total_budget)


24649428


In [5]:
#load student csv
citystudents = os.path.join("students_complete.csv")

In [6]:
#read citystudents with pandas
citystudents_df = pd.read_csv(citystudents)
citystudents_df.head()

Unnamed: 0,Student ID,name,gender,grade,school,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84


In [7]:
#change the school column header in cityschools
cityschools_df = cityschools_df.rename(columns={"name": "school"})
cityschools_df.head()

Unnamed: 0,School ID,school,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500


In [8]:
#merget tables
schoolsdata_df = pd.merge(citystudents_df, cityschools_df, on="school")
schoolsdata_df.head()

Unnamed: 0,Student ID,name,gender,grade,school,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


In [9]:
# #
# **District Summary**

# * Create a high level snapshot (in table form) of the district's key metrics, including:
#   * Total Schools
total_schools = schoolsdata_df['school'].nunique()
print(total_schools)

15


In [10]:
#   * Total Students
total_students=len(schoolsdata_df.axes[0])
print(total_students)

39170


In [11]:
#   * Average Math Score
district_average_math = schoolsdata_df['math_score'].mean()
print(district_average_math)

78.98537145774827


In [12]:
#   * Average Reading Score
district_average_reading = schoolsdata_df['reading_score'].mean()
print(district_average_reading)

81.87784018381414


In [13]:
#  * % Passing Math
bins = [1, 60, 100]
score = [False, True]


In [14]:
pd.cut(schoolsdata_df['math_score'], bins, labels=score)
schoolsdata_df['passing_math'] = pd.cut(schoolsdata_df['math_score'], bins, labels=score)
schoolsdata_df.head()

Unnamed: 0,Student ID,name,gender,grade,school,reading_score,math_score,School ID,type,size,budget,passing_math
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635,True
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635,True
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635,False
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635,False
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635,True


In [15]:
schoolsdata_df['passing_math'] = schoolsdata_df['passing_math'] == True
mpass_district = schoolsdata_df['passing_math'].sum()
print(mpass_district)
                 

35608


In [16]:
pmpass_district = mpass_district / total_students * 100
print(pmpass_district)

90.9063058463


In [17]:
#   * % Passing Reading
Bins = [1, 60, 100]
Score = [False, True]


In [18]:
pd.cut(schoolsdata_df['reading_score'], bins, labels=score)
schoolsdata_df['passing_reading'] = pd.cut(schoolsdata_df['reading_score'], bins, labels=score)
schoolsdata_df.head()

Unnamed: 0,Student ID,name,gender,grade,school,reading_score,math_score,School ID,type,size,budget,passing_math,passing_reading
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635,True,True
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635,True,True
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635,False,True
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635,False,True
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635,True,True


In [19]:
schoolsdata_df['passing_reading'] = schoolsdata_df['passing_reading'] == True
rpass_district = schoolsdata_df['passing_reading'].sum()
print(rpass_district)

39170


In [20]:
prpass_district = rpass_district / total_students * 100
print(prpass_district)

100.0


In [21]:
#   * Overall Passing Rate (Average of the above two)
district_passing = pmpass_district / prpass_district * 100
print(district_passing)

90.9063058463


In [22]:
district_summary = pd.DataFrame({'Total Schools': [total_schools],
                                 'Total Students': [total_students], 
                                 'Total Budget': [total_budget], 
                                 'Average Math Score': [district_average_math],
                                 'Average Reading Score': [district_average_reading],
                                 '% Passing Math': [pmpass_district],
                                 '% Passing Reading': [rpass_district],
                                 '% Overall Passing Rate': [district_passing]
})
district_summary = district_summary[['Total Schools',
                                    'Total Students',
                                    'Total Budget',
                                    'Average Math Score',
                                    'Average Reading Score',
                                    '% Passing Math',
                                    '% Passing Reading',
                                    '% Overall Passing Rate']]
district_summary = district_summary.round(2)

district_summary

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
0,15,39170,24649428,78.99,81.88,90.91,39170,90.91


In [23]:
schooltype_df = (schoolsdata_df['type']. groupby(schoolsdata_df['school'])).describe()


In [24]:
schooltype_df.reset_index(inplace=True)


In [25]:
schooltype_df = schooltype_df.rename(columns={"count": "Total Students", "top": "School Type"})


In [26]:
#   * Total School Budget
schoolbudget_df = (schoolsdata_df['budget']. groupby(schoolsdata_df['school'])).describe()



In [27]:
schoolbudget_df.reset_index(inplace=True)


In [28]:
df1_df = pd.merge(schooltype_df, schoolbudget_df, on='school') 


In [29]:
df1_df.reset_index(inplace=False)


Unnamed: 0,index,school,Total Students,unique,School Type,freq,count,mean,std,min,25%,50%,75%,max
0,0,Bailey High School,4976,1,District,4976,4976.0,3124928.0,0.0,3124928.0,3124928.0,3124928.0,3124928.0,3124928.0
1,1,Cabrera High School,1858,1,Charter,1858,1858.0,1081356.0,0.0,1081356.0,1081356.0,1081356.0,1081356.0,1081356.0
2,2,Figueroa High School,2949,1,District,2949,2949.0,1884411.0,0.0,1884411.0,1884411.0,1884411.0,1884411.0,1884411.0
3,3,Ford High School,2739,1,District,2739,2739.0,1763916.0,0.0,1763916.0,1763916.0,1763916.0,1763916.0,1763916.0
4,4,Griffin High School,1468,1,Charter,1468,1468.0,917500.0,0.0,917500.0,917500.0,917500.0,917500.0,917500.0
5,5,Hernandez High School,4635,1,District,4635,4635.0,3022020.0,0.0,3022020.0,3022020.0,3022020.0,3022020.0,3022020.0
6,6,Holden High School,427,1,Charter,427,427.0,248087.0,0.0,248087.0,248087.0,248087.0,248087.0,248087.0
7,7,Huang High School,2917,1,District,2917,2917.0,1910635.0,0.0,1910635.0,1910635.0,1910635.0,1910635.0,1910635.0
8,8,Johnson High School,4761,1,District,4761,4761.0,3094650.0,0.0,3094650.0,3094650.0,3094650.0,3094650.0,3094650.0
9,9,Pena High School,962,1,Charter,962,962.0,585858.0,0.0,585858.0,585858.0,585858.0,585858.0,585858.0


In [30]:
schoolsdata_df.passing_math = schoolsdata_df.passing_math.astype(int)

In [31]:
#   * % Passing Math
schoolmathpass_df = (schoolsdata_df['passing_math']. groupby(schoolsdata_df['school'])).mean().to_frame()
schoolmathpass_df.reset_index(inplace=True)
schoolmathpass_df.head()

Unnamed: 0,school,passing_math
0,Bailey High School,0.874397
1,Cabrera High School,1.0
2,Figueroa High School,0.864361
3,Ford High School,0.872216
4,Griffin High School,1.0


In [32]:
#schoolmathpass_df = schoolmathpass_df.rename(columns={"count": "mathcountpass","unique": "math", "top": "passing_math", "freq": "frequency"})
#schoolmathpass_df.head()

In [33]:
df2_df = pd.merge(df1_df, schoolmathpass_df, on="school")


In [34]:
schoolsdata_df.passing_reading = schoolsdata_df.passing_reading.astype(int)

In [35]:
#   * % Passing Reading
schoolreadpass_df = (schoolsdata_df['passing_reading']. groupby(schoolsdata_df['school'])).mean().to_frame()
schoolreadpass_df.reset_index(inplace=True)
schoolreadpass_df.head() 

Unnamed: 0,school,passing_reading
0,Bailey High School,1
1,Cabrera High School,1
2,Figueroa High School,1
3,Ford High School,1
4,Griffin High School,1


In [36]:
#schoolreadpass_df = schoolreadpass_df.rename(columns={"count": "readcountpass","unique": "read", "top": "passing_reading", "freq": "frequen"})


In [37]:
schoolsum_df = pd.merge(df2_df, schoolreadpass_df, on='school')
schoolsum_df.head()

Unnamed: 0,school,Total Students,unique,School Type,freq,count,mean,std,min,25%,50%,75%,max,passing_math,passing_reading
0,Bailey High School,4976,1,District,4976,4976.0,3124928.0,0.0,3124928.0,3124928.0,3124928.0,3124928.0,3124928.0,0.874397,1
1,Cabrera High School,1858,1,Charter,1858,1858.0,1081356.0,0.0,1081356.0,1081356.0,1081356.0,1081356.0,1081356.0,1.0,1
2,Figueroa High School,2949,1,District,2949,2949.0,1884411.0,0.0,1884411.0,1884411.0,1884411.0,1884411.0,1884411.0,0.864361,1
3,Ford High School,2739,1,District,2739,2739.0,1763916.0,0.0,1763916.0,1763916.0,1763916.0,1763916.0,1763916.0,0.872216,1
4,Griffin High School,1468,1,Charter,1468,1468.0,917500.0,0.0,917500.0,917500.0,917500.0,917500.0,917500.0,1.0,1


In [38]:
schoolsummary_df = schoolsum_df[["school", "School Type", "Total Students", "mean", "passing_math", "passing_reading"]]


In [39]:
#   * Average Math Score
schoolmathavg = (schoolsdata_df['math_score']. groupby(schoolsdata_df['school'])).mean().to_frame()
schoolmathavg.reset_index(inplace=True)


In [40]:
#   * Average Reading Score
schoolreadingavg = (schoolsdata_df['reading_score']. groupby(schoolsdata_df['school'])).mean().to_frame()
schoolreadingavg.reset_index(inplace=True)


In [41]:
averagescores_df = pd.merge(schoolmathavg, schoolreadingavg, on="school")


In [42]:
school_summary_df = pd.merge(schoolsummary_df, averagescores_df, on="school")
school_summary_df.head()

Unnamed: 0,school,School Type,Total Students,mean,passing_math,passing_reading,math_score,reading_score
0,Bailey High School,District,4976,3124928.0,0.874397,1,77.048432,81.033963
1,Cabrera High School,Charter,1858,1081356.0,1.0,1,83.061895,83.97578
2,Figueroa High School,District,2949,1884411.0,0.864361,1,76.711767,81.15802
3,Ford High School,District,2739,1763916.0,0.872216,1,77.102592,80.746258
4,Griffin High School,Charter,1468,917500.0,1.0,1,83.351499,83.816757


In [43]:
#   * Per student Budget
school_summary_df["Per Student Budget"] = school_summary_df["mean"]/school_summary_df["Total Students"]
school_summary_df.head()

Unnamed: 0,school,School Type,Total Students,mean,passing_math,passing_reading,math_score,reading_score,Per Student Budget
0,Bailey High School,District,4976,3124928.0,0.874397,1,77.048432,81.033963,628
1,Cabrera High School,Charter,1858,1081356.0,1.0,1,83.061895,83.97578,582
2,Figueroa High School,District,2949,1884411.0,0.864361,1,76.711767,81.15802,639
3,Ford High School,District,2739,1763916.0,0.872216,1,77.102592,80.746258,644
4,Griffin High School,Charter,1468,917500.0,1.0,1,83.351499,83.816757,625


In [44]:
#   * Overall Passing Rate (Average of the above two)
school_summary_df["% Overall Passing Rate"] = (school_summary_df["passing_math"] + school_summary_df["passing_reading"])/2
school_summary_df.head()

Unnamed: 0,school,School Type,Total Students,mean,passing_math,passing_reading,math_score,reading_score,Per Student Budget,% Overall Passing Rate
0,Bailey High School,District,4976,3124928.0,0.874397,1,77.048432,81.033963,628,0.937199
1,Cabrera High School,Charter,1858,1081356.0,1.0,1,83.061895,83.97578,582,1.0
2,Figueroa High School,District,2949,1884411.0,0.864361,1,76.711767,81.15802,639,0.93218
3,Ford High School,District,2739,1763916.0,0.872216,1,77.102592,80.746258,644,0.936108
4,Griffin High School,Charter,1468,917500.0,1.0,1,83.351499,83.816757,625,1.0


In [45]:
school_summary_df = school_summary_df.rename(columns={"mean": "Total School Budget", "passing_math": "% Passing Math", "passing_reading": "% Passing Reading", "math_score": "Average Math Score", "reading_score": "Average Reading Score"})
school_summary_df = school_summary_df[["school", "School Type", "Total Students", "Total School Budget", "Per Student Budget", "Average Math Score", "Average Reading Score", "% Passing Math", "% Passing Reading", "% Overall Passing Rate"]]
school_summary_df.head()

Unnamed: 0,school,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
0,Bailey High School,District,4976,3124928.0,628,77.048432,81.033963,0.874397,1,0.937199
1,Cabrera High School,Charter,1858,1081356.0,582,83.061895,83.97578,1.0,1,1.0
2,Figueroa High School,District,2949,1884411.0,639,76.711767,81.15802,0.864361,1,0.93218
3,Ford High School,District,2739,1763916.0,644,77.102592,80.746258,0.872216,1,0.936108
4,Griffin High School,Charter,1468,917500.0,625,83.351499,83.816757,1.0,1,1.0


In [46]:
school_summary_df['Total School Budget'] = school_summary_df['Total School Budget'].map("${:,.0f}".format)
school_summary_df['Per Student Budget'] = school_summary_df['Per Student Budget'].map("${:,.0f}".format)
school_summary_df.head()



Unnamed: 0,school,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
0,Bailey High School,District,4976,"$3,124,928",$628,77.048432,81.033963,0.874397,1,0.937199
1,Cabrera High School,Charter,1858,"$1,081,356",$582,83.061895,83.97578,1.0,1,1.0
2,Figueroa High School,District,2949,"$1,884,411",$639,76.711767,81.15802,0.864361,1,0.93218
3,Ford High School,District,2739,"$1,763,916",$644,77.102592,80.746258,0.872216,1,0.936108
4,Griffin High School,Charter,1468,"$917,500",$625,83.351499,83.816757,1.0,1,1.0


In [47]:
school_summary_df['% Passing Math'] = school_summary_df['% Passing Math'].map("{:.2%}".format)
school_summary_df['% Passing Reading'] = school_summary_df['% Passing Reading'].map("{:.2%}".format)
school_summary_df['% Overall Passing Rate'] = school_summary_df['% Overall Passing Rate'].map("{:.2%}".format)
school_summary_df.head()

Unnamed: 0,school,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
0,Bailey High School,District,4976,"$3,124,928",$628,77.048432,81.033963,87.44%,100.00%,93.72%
1,Cabrera High School,Charter,1858,"$1,081,356",$582,83.061895,83.97578,100.00%,100.00%,100.00%
2,Figueroa High School,District,2949,"$1,884,411",$639,76.711767,81.15802,86.44%,100.00%,93.22%
3,Ford High School,District,2739,"$1,763,916",$644,77.102592,80.746258,87.22%,100.00%,93.61%
4,Griffin High School,Charter,1468,"$917,500",$625,83.351499,83.816757,100.00%,100.00%,100.00%


In [48]:
#school_summary_df.set_index('school')

In [49]:
# **Top Performing Schools (By Passing Rate)**

# * Create a table that highlights the top 5 performing schools based on Overall Passing Rate. Include:
#   * School Name
#   * School Type
#   * Total Students
#   * Total School Budget
#   * Per School Budget
#   * Average Math Score
#   * Average Reading Score
#   * % Passing Math
#   * % Passing Reading
#   * Overall Passing Rate (Average of the above two)

school_summary_df['% Overall Passing Rate'] = school_summary_df['% Overall Passing Rate'].replace('%','',regex=True).astype('float')
#school_summary_df['% Overall Passing Rate'] = school_summary_df['% Overall Passing Rate'].map("{:.2%}".format)
school_summary_df.head()

Unnamed: 0,school,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
0,Bailey High School,District,4976,"$3,124,928",$628,77.048432,81.033963,87.44%,100.00%,93.72
1,Cabrera High School,Charter,1858,"$1,081,356",$582,83.061895,83.97578,100.00%,100.00%,100.0
2,Figueroa High School,District,2949,"$1,884,411",$639,76.711767,81.15802,86.44%,100.00%,93.22
3,Ford High School,District,2739,"$1,763,916",$644,77.102592,80.746258,87.22%,100.00%,93.61
4,Griffin High School,Charter,1468,"$917,500",$625,83.351499,83.816757,100.00%,100.00%,100.0


In [50]:
school_summary_df.sort_values(by='% Overall Passing Rate', ascending=True)
school_summary_df.head()

Unnamed: 0,school,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
0,Bailey High School,District,4976,"$3,124,928",$628,77.048432,81.033963,87.44%,100.00%,93.72
1,Cabrera High School,Charter,1858,"$1,081,356",$582,83.061895,83.97578,100.00%,100.00%,100.0
2,Figueroa High School,District,2949,"$1,884,411",$639,76.711767,81.15802,86.44%,100.00%,93.22
3,Ford High School,District,2739,"$1,763,916",$644,77.102592,80.746258,87.22%,100.00%,93.61
4,Griffin High School,Charter,1468,"$917,500",$625,83.351499,83.816757,100.00%,100.00%,100.0


In [51]:
#school_summary_df['% Overall Passing Rate'] = school_summary_df['% Overall Passing Rate'].map("{:.2%}".format)


In [52]:
# **Top Performing Schools (By Passing Rate)**

# * Create a table that highlights the bottom 5 performing schools based on Overall Passing Rate. Include all of the same metrics as aboveschool_summary_df.sort_values(by='% Overall Passing Rate', ascending=Fal;)
school_summary_df.sort_values(by='% Overall Passing Rate', ascending=False)
school_summary_df.head()

Unnamed: 0,school,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
0,Bailey High School,District,4976,"$3,124,928",$628,77.048432,81.033963,87.44%,100.00%,93.72
1,Cabrera High School,Charter,1858,"$1,081,356",$582,83.061895,83.97578,100.00%,100.00%,100.0
2,Figueroa High School,District,2949,"$1,884,411",$639,76.711767,81.15802,86.44%,100.00%,93.22
3,Ford High School,District,2739,"$1,763,916",$644,77.102592,80.746258,87.22%,100.00%,93.61
4,Griffin High School,Charter,1468,"$917,500",$625,83.351499,83.816757,100.00%,100.00%,100.0


In [53]:
#change the school column header in cityschools

schoolsdata_df.head()

Unnamed: 0,Student ID,name,gender,grade,school,reading_score,math_score,School ID,type,size,budget,passing_math,passing_reading
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635,1,1
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635,1,1
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635,0,1
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635,0,1
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635,1,1


In [54]:
#schoolsdata_df['grade'] = schoolsdata_df['grade'].replace('th','',regex=True).astype('float')

In [62]:
# **Math Scores by Grade**

# * Create a table that lists the average Math Score for students of each grade level (9th, 10th, 11th, 12th) at each school.
schoolmathscores = schoolsdata_df.groupby(["school", "grade"])["math_score"].mean()
#schoolmathscores({'Average Math Score' : schoolmathscores.groupby( [ "school", "grade"] ).size()}).reset_index()
schoolmathscores.head()

school               grade
Bailey High School   10th     76.996772
                     11th     77.515588
                     12th     76.492218
                     9th      77.083676
Cabrera High School  10th     83.154506
Name: math_score, dtype: float64

In [59]:
# **Reading Scores by Grade**

# * Create a table that lists the average Reading Score for students of each grade level (9th, 10th, 11th, 12th) at each school.
schoolreadingscores = schoolsdata_df.groupby(["school", "grade"])["reading_score"].mean()
schoolreadingscores.head()

school               grade
Bailey High School   10th     80.907183
                     11th     80.945643
                     12th     80.912451
                     9th      81.303155
Cabrera High School  10th     84.253219
Name: reading_score, dtype: float64

In [None]:
# **Scores by School Spending**

# * Create a table that breaks down school performances based on average Spending Ranges (Per Student). Use 4 reasonable bins to group school spending. Include in the table each of the following:
#   * Average Math Score
#   * Average Reading Score
#   * % Passing Math
#   * % Passing Reading
#   * Overall Passing Rate (Average of the above two)

In [None]:
# **Scores by School Size**

# * Repeat the above breakdown, but this time group schools based on a reasonable approximation of school size (Small, Medium, Large).

In [None]:
# **Scores by School Type**

# * Repeat the above breakdown, but this time group schools based on school type (Charter vs. District).

In [None]:
# As final considerations:

# * Your script must work for both data-sets given.
# * You must use the Pandas Library and the Jupyter Notebook.
# * You must submit a link to your Jupyter Notebook with the viewable Data Frames. 
# * You must include an exported markdown version of your Notebook called  `README.md` in your GitHub repository.  
# * You must include a written description of three observable trends based on the data. 
# * See [Example Solution](PyCitySchools/PyCitySchools_Example.pdf) for a reference on the expected format. 

In [None]:
# ## Hints and Considerations

# * These are challenging activities for a number of reasons. For one, these activities will require you to analyze thousands of records. Hacking through the data to look for obvious trends in Excel is just not a feasible option. The size of the data may seem daunting, but Python Pandas will allow you to efficiently parse through it. 

# * Second, these activities will also challenge you by requiring you to learn on your feet. Don't fool yourself into thinking: "I need to study Pandas more closely before diving in." Get the basic gist of the library and then _immediately_ get to work. When facing a daunting task, it's easy to think: "I'm just not ready to tackle it yet." But that's the surest way to never succeed. Learning to program requires one to constantly tinker, experiment, and learn on the fly. You are doing exactly the _right_ thing, if you find yourself constantly practicing Google-Fu and diving into documentation. There is just no way (or reason) to try and memorize it all. Online references are available for you to use when you need them. So use them!

# * Take each of these tasks one at a time. Begin your work, answering the basic questions: "How do I import the data?" "How do I convert the data into a DataFrame?" "How do I build the first table?" Don't get intimidated by the number of asks. Many of them are repetitive in nature with just a few tweaks. Be persistent and creative!

# * Expect these exercises to take time! Don't get discouraged if you find yourself spending  hours initially with little progress. Force yourself to deal with the discomfort of not knowing and forge ahead. This exercise is likely to take between 15-30 hours of your time. Consider these hours an investment in your future!

# * As always, feel encouraged to work in groups and get help from your TAs and Instructor. Just remember, true success comes from mastery and _not_ a completed homework assignment. So challenge yourself to truly succeed!