In [None]:
# Import pandas
import pandas as pd

# Load the CSV named by `filename` (defined outside this cell), indexing rows by 'county': election
election = pd.read_csv(filename, index_col='county')

# Pull the 'winner', 'total' and 'voters' columns into their own DataFrame: results
results = election.loc[:, ['winner', 'total', 'voters']]

# Show the first rows of results
print(results.head())


In [None]:
# Columns 'state' through 'Obama' (label slices are inclusive on both ends): left_columns
left_columns = election.loc[:, slice('state', 'Obama')]

# Preview left_columns
print(left_columns.head())

# Columns 'Obama' through 'winner': middle_columns
middle_columns = election.loc[:, slice('Obama', 'winner')]

# Preview middle_columns
print(middle_columns.head())

# Columns from 'Romney' through the last column: right_columns
right_columns = election.loc[:, slice('Romney', None)]

# Preview right_columns
print(right_columns.head())


In [None]:
# Row labels (rows) and column labels (cols) to extract
rows, cols = ['Philadelphia', 'Centre', 'Fulton'], ['winner', 'Obama', 'Romney']

# Select the rows-by-cols sub-table in one label-based lookup: three_counties
three_counties = election.loc[rows, cols]

# Show the resulting DataFrame
print(three_counties)


In [None]:
# Boolean mask that is True where 'turnout' exceeds 70: high_turnout
high_turnout = election['turnout'] > 70

# Keep only the rows selected by the mask: high_turnout_df
high_turnout_df = election.loc[high_turnout]

# Show the filtered DataFrame
print(high_turnout_df)


In [None]:
# Import numpy
import numpy as np

# Boolean mask that is True where 'margin' is below 1: too_close
too_close = election['margin'] < 1

# Set 'winner' to NaN on the masked rows with a single .loc assignment.
# The chained form `election.winner[too_close] = ...` assigns through an
# intermediate Series, which raises SettingWithCopyWarning and is not
# guaranteed to modify `election` itself.
election.loc[too_close, 'winner'] = np.nan

# Print the output of election.info()
print(election.info())

In [None]:
# In certain scenarios, it may be necessary to remove rows and columns with
# missing data from a DataFrame. The .dropna() method is used to perform this action. 

# Restrict titanic to the 'age' and 'cabin' columns: df
df = titanic[['age', 'cabin']]

# Shape before dropping anything
print(df.shape)

# Shape after dropping rows that have at least one missing value
print(df.dropna(how='any').shape)

# Shape after dropping rows in which every value is missing
print(df.dropna(how='all').shape)

# Drop titanic columns with fewer than 1000 non-missing values, then call .info()
print(titanic.dropna(axis='columns', thresh=1000).info())



In [3]:
# The .apply() method can be used on a pandas DataFrame to apply an arbitrary Python function to each column (or row); a function built from element-wise arithmetic therefore transforms every element.

# Fahrenheit-to-Celsius conversion helper: to_celsius
def to_celsius(F):
    """Convert a temperature in degrees Fahrenheit to degrees Celsius.

    Only arithmetic operators are used, so F may be a number or any object
    supporting them element-wise (e.g. a pandas Series).
    """
    offset = F - 32
    return 5/9*offset

# Run to_celsius over the two Fahrenheit columns of weather: df_celsius
df_celsius = weather[['Mean TemperatureF', 'Mean Dew PointF']].apply(to_celsius)

# Relabel the converted columns with Celsius names
df_celsius.columns = ['Mean TemperatureC', 'Mean Dew PointC']

# Preview the converted data
print(df_celsius.head())


2

In [None]:
# The .map() method is used to transform values according to a Python dictionary look-up.
# Candidate -> colour lookup table: red_vs_blue
red_vs_blue = dict(Obama='blue', Romney='red')

# Translate each 'winner' value through the lookup and store it as election['color']
election['color'] = election['winner'].map(red_vs_blue)

# Preview election with the new column
print(election.head())


In [None]:
# When performance is paramount, you should avoid using .apply() and .map()
# because those constructs perform Python for-loops over the data stored in 
# a pandas Series or DataFrame. By using vectorized functions instead, you
# can loop over the data at the same speed as compiled code (C, Fortran, etc.)!
# NumPy, SciPy and pandas come with a variety of vectorized functions
# (called Universal Functions or UFuncs in NumPy).

# You can even write your own vectorized functions, but for now we will focus
# on the ones distributed by NumPy and pandas.

# In this exercise you're going to import the zscore method from scipy.stats and
# use it to compute the deviation in voter turnout in Pennsylvania from the mean
# in fractions of the standard deviation. In statistics, the z-score is the number
# of standard deviations by which an observation is above the mean - so if it is
# negative, it means the observation is below the mean.

# Instead of using .apply() as you did in the earlier exercises, the zscore UFunc
# will take a pandas Series as input and return a NumPy array. You will then assign
# the values of the NumPy array to a new column in the DataFrame. 


# Import zscore from scipy.stats
from scipy.stats import zscore

# Standardize the 'turnout' column with scipy's zscore: turnout_zscore
turnout_zscore = zscore(election.turnout)

# Show what kind of object zscore returned
print(type(turnout_zscore))

# Store the z-scores as a new column: election['turnout_zscore']
election['turnout_zscore'] = turnout_zscore

# Preview election with the new column
print(election.head())


In [None]:
# Indexes are immutable objects. This means that if you want to change 
# or modify the index in a DataFrame, then you need to change the whole index. 

# A list comprehension is a succinct way to generate a list in one line.
# For example, the following list comprehension generates a list that contains
# the cubes of all numbers from 0 to 9: 
# Fixed: the original line ended with a stray '.', which is a SyntaxError.
cubes = [i**3 for i in range(10)]
# This is equivalent to the following code:

cubes = []
for i in range(10):
    cubes.append(i**3)
    
# Upper-case every label of the current index: new_idx
new_idx = [label.upper() for label in sales.index]

# An Index cannot be edited in place, so replace it wholesale
sales.index = new_idx

# Show sales with the new index
print(sales)


In [4]:
# Label the row index 'MONTHS'
sales.index.names = ['MONTHS']

# Show sales with the named index
print(sales)

# Label the column index 'PRODUCTS'
sales.columns.names = ['PRODUCTS']

# Show sales again, now with both axes named
print(sales)


[0, 1, 8, 27, 64, 125, 216, 343, 512, 729]

In [None]:
# You can also build the DataFrame and index independently, and then put them
# together. If you take this route, be careful, as any mistakes in generating
# the DataFrame or the index can cause the data and the index to be aligned incorrectly.

# Six month abbreviations, January through June: months
months = 'Jan Feb Mar Apr May Jun'.split()

# Use them as the new row index (the length must match the number of rows)
sales.index = months

# Show the re-indexed sales DataFrame
print(sales)


In [None]:
# Rows labelled exactly 'CA' and 'TX' (list lookup)
print(sales.loc[['CA', 'TX'], :])

# Every row from label 'CA' through label 'TX' (inclusive label slice)
print(sales.loc['CA':'TX'])


In [None]:
# With a MultiIndex, you should always ensure the index is sorted. You can
# skip this only if you know the data is already sorted on the index fields.

# Build a ('state', 'month') MultiIndex and sort it in one chained step: sales
sales = sales.set_index(['state', 'month']).sort_index()

# Show the re-indexed, sorted DataFrame
print(sales)

In [None]:
# Use the 'state' column as the row index: sales
sales = sales.set_index(keys='state')

# Show the re-indexed DataFrame
print(sales)

# Look up the row(s) labelled 'NY'
print(sales.loc['NY', :])


In [None]:
# Looking up data based on inner levels of a MultiIndex can be a bit trickier.
# In this exercise, you will use your sales DataFrame to do some increasingly complex lookups.

# The trickiest of all these lookups are when you want to access some inner levels of the
# index. In this case, you need to use slice(None) in the slicing parameter for the outermost
# dimension(s) instead of the usual :, or use pd.IndexSlice. You can refer to the pandas
# documentation for more details. For example, in the video, Dhavide used the following
# code to extract rows from all Symbols for the dates Oct. 3rd through 4th inclusive:

# stocks.loc[(slice(None), slice('2016-10-03', '2016-10-04')), :]

# pd.IndexSlice[...] returns the tuple written inside the brackets, so each
# lookup below uses the same key as the plain-tuple spelling.

# Row for state 'NY', month 1: NY_month1
NY_month1 = sales.loc[pd.IndexSlice['NY', 1], :]

# Rows for states 'CA' and 'TX' in month 2: CA_TX_month2
CA_TX_month2 = sales.loc[pd.IndexSlice[['CA', 'TX'], 2], :]

# Rows for every state in month 2 (':' on the outer level): all_month2
all_month2 = sales.loc[pd.IndexSlice[:, 2], :]

In [None]:
# Reshape users so weekdays are rows, cities are columns and cells hold 'visitors': visitors_pivot
visitors_pivot = users.pivot(
    index='weekday',
    columns='city',
    values='visitors',
)

# Show the pivoted DataFrame
print(visitors_pivot)

''' print(users)
  weekday    city  visitors  signups
0     Sun  Austin       139        7
1     Sun  Dallas       237       12
2     Mon  Austin       326        3
3     Mon  Dallas       456        5

<script.py> output:
    city     Austin  Dallas
    weekday                
    Mon         326     456
    Sun         139     237'
'''



In [None]:
# If you do not select any particular variables, all of them will be pivoted. 
# In this case - with the users DataFrame - both 'visitors' and 'signups' will
# be pivoted, creating hierarchical column labels.

# Weekday-by-city table of 'signups' only: signups_pivot
signups_pivot = users.pivot(
    index='weekday',
    columns='city',
    values='signups',
)

# Show signups_pivot
print(signups_pivot)

# Omitting `values` pivots every remaining column, giving two-level column labels: pivot
pivot = users.pivot(
    index='weekday',
    columns='city',
)

# Show the full pivot
print(pivot)


'''
 users
Out[7]: 
  weekday    city  visitors  signups
0     Sun  Austin       139        7
1     Sun  Dallas       237       12
2     Mon  Austin       326        3
3     Mon  Dallas       456        5

<script.py> output:
    city     Austin  Dallas
    weekday                
    Mon           3       5
    Sun           7      12
            visitors        signups       
    city      Austin Dallas  Austin Dallas
    weekday                               
    Mon          326    456       3      5
    Sun          139    237       7     12

'''

In [None]:
# Move the 'weekday' index level into the columns: byweekday
byweekday = users.unstack('weekday')

# Show the unstacked DataFrame
print(byweekday)

# Move 'weekday' back from the columns into the index and show the result
print(byweekday.stack('weekday'))

'''
<script.py> output:
            visitors      signups    
    weekday      Mon  Sun     Mon Sun
    city                             
    Austin       326  139       3   7
    Dallas       456  237       5  12
                    visitors  signups
    city   weekday                   
    Austin Mon           326        3
           Sun           139        7
    Dallas Mon           456        5
           Sun           237       12
'''

In [None]:
# Move the 'city' index level into the columns: bycity
bycity = users.unstack('city')

# Show the unstacked DataFrame
print(bycity)

# Move 'city' back from the columns into the index and show the result
print(bycity.stack('city'))

'''
<script.py> output:
            visitors        signups       
    city      Austin Dallas  Austin Dallas
    weekday                               
    Mon          326    456       3      5
    Sun          139    237       7     12
                    visitors  signups
    weekday city                     
    Mon     Austin       326        3
            Dallas       456        5
    Sun     Austin       139        7
            Dallas       237       12
'''

In [None]:
# Continuing from the previous exercise, you will now use .swaplevel(0, 1) to flip
# the index levels. Note they won't be sorted. To sort them, you will have to follow
# up with a .sort_index(). You will then obtain the original DataFrame. Note that an
# unsorted index leads to slicing failures.

# Return 'city' from the columns of bycity to the row index: newusers
newusers = bycity.stack('city')

# Exchange the first and second index levels: newusers
newusers = newusers.swaplevel(i=0, j=1)

# Show newusers; after the swap the index is not in sorted order
print(newusers)

# Put the index back in sorted order: newusers
newusers = newusers.sort_index()

# Show newusers again, now with a sorted index
print(newusers)

# Check that the round trip reproduced the original users DataFrame
print(newusers.equals(users))



'''
<script.py> output:
                    visitors  signups
    city   weekday                   
    Austin Mon           326        3
    Dallas Mon           456        5
    Austin Sun           139        7
    Dallas Sun           237       12
                    visitors  signups
    city   weekday                   
    Austin Mon           326        3
           Sun           139        7
    Dallas Mon           456        5
           Sun           237       12
    True
'''

In [None]:
# the goal of melting is to restore a pivoted DataFrame to its original form,
# or to change it from a wide shape to a long shape. You can explicitly specify
# the columns that should remain in the reshaped DataFrame with id_vars, and
# list which columns to convert into values with value_vars. As Dhavide demonstrated,
# if you don't pass a name to the values in pd.melt(), you will lose the name of your
# variable. You can fix this by using the value_name keyword argument.

'''
In [1]: visitors_by_city_weekday
Out[1]: 
city     Austin  Dallas
weekday                
Mon         326     456
Sun         139     237
'''


# Turn the 'weekday' index back into an ordinary column: visitors_by_city_weekday
visitors_by_city_weekday = visitors_by_city_weekday.reset_index()

# Show the flattened DataFrame
print(visitors_by_city_weekday)

# Go from wide to long: keep 'weekday' as the identifier and name the value column 'visitors': visitors
visitors = pd.melt(
    visitors_by_city_weekday,
    id_vars=['weekday'],
    value_name='visitors',
)

# Show the melted DataFrame
print(visitors)

'''
<script.py> output:
    city weekday  Austin  Dallas
    0        Mon     326     456
    1        Sun     139     237
      weekday    city  visitors
    0     Mon  Austin       326
    1     Sun  Austin       139
    2     Mon  Dallas       456
    3     Sun  Dallas       237
'''

In [None]:
'''
In [1]: users
Out[1]: 
  weekday    city  visitors  signups
0     Sun  Austin       139        7
1     Sun  Dallas       237       12
2     Mon  Austin       326        3
3     Mon  Dallas       456        5
'''

# You can move multiple columns into a single column (making the data long and skinny) by "melting" multiple columns. 

# Melt every column except the identifiers 'weekday' and 'city' into variable/value pairs: skinny
skinny = pd.melt(
    users,
    id_vars=['weekday', 'city'],
)

# Show the long-format DataFrame
print(skinny)

'''
<script.py> output:
      weekday    city  variable  value
    0     Sun  Austin  visitors    139
    1     Sun  Dallas  visitors    237
    2     Mon  Austin  visitors    326
    3     Mon  Dallas  visitors    456
    4     Sun  Austin   signups      7
    5     Sun  Dallas   signups     12
    6     Mon  Austin   signups      3
    7     Mon  Dallas   signups      5

'''

In [None]:
# Sometimes, all you need is some key-value pairs, and the context does not matter.
# If said context is in the index, you can easily obtain what you want. For example,
# in the users DataFrame, the visitors and signups columns lend themselves well to being
# represented as key-value pairs. So if you created a hierarchical index with 'city' and
# 'weekday' columns as the index, you can easily extract key-value pairs for the 'visitors'
# and 'signups' columns by melting users and specifying col_level=0

# Move 'city' and 'weekday' into a two-level row index: users_idx
users_idx = users.set_index(['city', 'weekday'])

# Show the re-indexed DataFrame
print(users_idx)

# Melt the remaining columns into variable/value pairs: kv_pairs
kv_pairs = pd.melt(
    users_idx,
    col_level=0,
)

# Show the key-value pairs
print(kv_pairs)


'''
<script.py> output:
                    visitors  signups
    city   weekday                   
    Austin Sun           139        7
    Dallas Sun           237       12
    Austin Mon           326        3
    Dallas Mon           456        5
       variable  value
    0  visitors    139
    1  visitors    237
    2  visitors    326
    3  visitors    456
    4   signups      7
    5   signups     12
    6   signups      3
    7   signups      5
'''

In [None]:
# a pivot table allows you to see all of your variables as a function of two other variables.
# In this exercise, you will use the .pivot_table() method to see how the users DataFrame
# entries appear when presented as functions of the 'weekday' and 'city' columns. That is,
# with the rows indexed by 'weekday' and the columns indexed by 'city'.

'''
In [1]: users
Out[1]: 
  weekday    city  visitors  signups
0     Sun  Austin       139        7
1     Sun  Dallas       237       12
2     Mon  Austin       326        3
3     Mon  Dallas       456        5
'''

# Pivot table with 'weekday' rows and 'city' columns, using the default aggregation: by_city_day
by_city_day = pd.pivot_table(users, index='weekday', columns='city')

# Show by_city_day
print(by_city_day)


'''
<script.py> output:
            visitors        signups       
    city      Austin Dallas  Austin Dallas
    weekday                               
    Mon          326    456       3      5
    Sun          139    237       7     12
'''


In [None]:
# You can also use aggregation functions within a pivot table by specifying the aggfunc parameter.

'''
In [1]: users
Out[1]: 
  weekday    city  visitors  signups
0     Sun  Austin       139        7
1     Sun  Dallas       237       12
2     Mon  Austin       326        3
3     Mon  Dallas       456        5
'''

# Per-weekday counts using the string alias 'count': count_by_weekday1
count_by_weekday1 = pd.pivot_table(users, index='weekday', aggfunc='count')

# Show the counts
print(count_by_weekday1)

# Same table, but aggregating with the builtin len: count_by_weekday2
count_by_weekday2 = pd.pivot_table(users, index='weekday', aggfunc=len)

# Print a separator, then whether the two tables are equal
print('==========================================')
print(count_by_weekday1.equals(count_by_weekday2))


'''
<script.py> output:
            visitors        signups       
    city      Austin Dallas  Austin Dallas
    weekday                               
    Mon            1      1       1      1
    Sun            1      1       1      1
    ==========================================
    True

<script.py> output:
             city  signups  visitors
    weekday                         
    Mon         2        2         2
    Sun         2        2         2
    ==========================================
    True
'''

In [None]:
# Sometimes it's useful to add totals in the margins of a pivot table. You can do this with the argument margins=True

'''
In [1]: users
Out[1]: 
  weekday    city  visitors  signups
0     Sun  Austin       139        7
1     Sun  Dallas       237       12
2     Mon  Austin       326        3
3     Mon  Dallas       456        5
'''

# Per-weekday totals of the pivoted columns: signups_and_visitors
signups_and_visitors = pd.pivot_table(users, index='weekday', aggfunc=sum)

# Show the totals
print(signups_and_visitors)

# Same table with an extra margins row of overall totals: signups_and_visitors_total
signups_and_visitors_total = pd.pivot_table(
    users,
    index='weekday',
    aggfunc=sum,
    margins=True,
)

# Show the totals including the margins row
print(signups_and_visitors_total)


'''
<script.py> output:
             signups  visitors
    weekday                   
    Mon            8       782
    Sun           19       376
             signups  visitors
    weekday                   
    Mon          8.0     782.0
    Sun         19.0     376.0
    All         27.0    1158.0

'''

In [None]:
# One group per value of 'pclass'
by_class = titanic.groupby(by='pclass')

# Number of non-missing 'survived' entries in each class
count_by_class = by_class['survived'].count()

# Show count_by_class
print(count_by_class)

# One group per ('embarked', 'pclass') combination
by_mult = titanic.groupby(by=['embarked', 'pclass'])

# Number of non-missing 'survived' entries in each combination
count_mult = by_mult['survived'].count()

# Show count_mult
print(count_mult)


'''
<script.py> output:
    pclass
    1    323
    2    277
    3    709
    Name: survived, dtype: int64
    embarked  pclass
    C         1         141
              2          28
              3         101
    Q         1           3
              2           7
              3         113
    S         1         177
              2         242
              3         495
    Name: survived, dtype: int64
'''

In [None]:
'''
In [1]: life_fname
Out[1]: 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1650/datasets/life_expectancy.csv'

'''

'''
In [4]: regions_fname
Out[4]: 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1650/datasets/regions.csv'
'''


# Load the file at life_fname, indexed by 'Country': life
life = pd.read_csv(life_fname, index_col='Country')

# Load the file at regions_fname, indexed by 'Country': regions
regions = pd.read_csv(regions_fname, index_col='Country')

# Group the rows of life by the region Series; both frames are indexed by
# 'Country', so the Series is matched to life's rows by label: life_by_region
life_by_region = life.groupby(regions['region'])

# Mean of the '2010' column within each region
print(life_by_region['2010'].mean())

'''
<script.py> output:
    region
    America                       74.037350
    East Asia & Pacific           73.405750
    Europe & Central Asia         75.656387
    Middle East & North Africa    72.805333
    South Asia                    68.189750
    Sub-Saharan Africa            57.575080
    Name: 2010, dtype: float64

'''

In [None]:
# The .agg() method can be used with a tuple or list of aggregations as input. 
# When applying multiple aggregations on multiple columns, the aggregated DataFrame has a multi-level column index.

# One group per passenger class: by_class
by_class = titanic.groupby(by='pclass')

# Restrict the grouped data to the 'age' and 'fare' columns
by_class_sub = by_class[['age', 'fare']]

# Compute max and median per column; the result has two-level column labels: aggregated
aggregated = by_class_sub.agg(['max', 'median'])

# Maximum age in each class, addressed by its (column, statistic) label
print(aggregated[('age', 'max')])

# Median fare in each class
print(aggregated[('fare', 'median')])

'''
<script.py> output:
    pclass
    1    80.0
    2    70.0
    3    74.0
    Name: (age, max), dtype: float64
    pclass
    1    60.0000
    2    15.0458
    3     8.0500
    Name: (fare, median), dtype: float64
'''

In [None]:
# If you have a DataFrame with a multi-level row index, the individual levels can
# be used to perform the groupby. This allows advanced aggregation techniques to be
# applied along one or more levels in the index and across one or more columns.

# Load gapminder.csv with a three-level ('Year', 'region', 'Country') index: gapminder
gapminder = pd.read_csv('gapminder.csv', index_col=['Year', 'region', 'Country'])
# Sort the MultiIndex before grouping on its levels
gapminder = gapminder.sort_index()

# Group on the 'Year' and 'region' index levels: by_year_region
by_year_region = gapminder.groupby(level=['Year', 'region'])

# Range of a series, used as a custom aggregation: spread
def spread(series):
    """Return the difference between the largest and smallest value of `series`."""
    highest = series.max()
    lowest = series.min()
    return highest - lowest

# Column -> aggregation mapping (string aliases plus the custom spread function): aggregator
aggregator = dict(population='sum', child_mortality='mean', gdp=spread)

# Apply a different aggregation to each listed column: aggregated
aggregated = by_year_region.agg(aggregator)

# Show the final six rows of the result
print(aggregated.tail(6))


'''
<script.py> output:
                                          gdp  child_mortality    population
    Year region                                                             
    2013 America                      49634.0        17.745833  9.629087e+08
         East Asia & Pacific         134744.0        22.285714  2.244209e+09
         Europe & Central Asia        86418.0         9.831875  8.968788e+08
         Middle East & North Africa  128676.0        20.221500  4.030504e+08
         South Asia                   11469.0        46.287500  1.701241e+09
         Sub-Saharan Africa           32035.0        76.944490  9.205996e+08

'''

In [None]:
# Groupby operations can also be performed on transformations of the index values.
# In the case of a DateTimeIndex, we can extract portions of the datetime over which to group.

# Is there a day of the week that is more popular for customers? To find out, you're going to use .strftime('%a')
# to transform the index datetime values to abbreviated days of the week.

# Load sales.csv with 'Date' parsed as a datetime index: sales
sales = pd.read_csv('sales.csv', parse_dates=True, index_col='Date')

# Group rows by the abbreviated weekday name ('%a') of their index timestamp: by_day
by_day = sales.groupby(sales.index.strftime('%a'))

# Total 'Units' for each weekday: units_sum
units_sum = by_day['Units'].sum()

# Show the per-weekday totals
print(units_sum)


'''
<script.py> output:
    Mon    48
    Sat     7
    Thu    59
    Tue    13
    Wed    48
    Name: Units, dtype: int64
'''

In [None]:
# you can apply a .transform() method after grouping to apply a function
# to groups of data independently. The z-score is also useful to find outliers:
# a z-score value of +/- 3 is generally considered to be an outlier.

# Import zscore
from scipy.stats import zscore

# Z-score 'life' and 'fertility' within each region: standardized
# The columns are selected with a list ([[...]]). The original tuple form,
# groupby(...)['life','fertility'], was deprecated and later removed in pandas.
standardized = gapminder_2010.groupby('region')[['life', 'fertility']].transform(zscore)

# Flag rows whose regional z-score is below -3 for 'life' or above 3 for 'fertility': outliers
outliers = (standardized['life'] < -3) | (standardized['fertility'] > 3)

# Keep only the flagged rows of gapminder_2010: gm_outliers
gm_outliers = gapminder_2010.loc[outliers]

# Print gm_outliers
print(gm_outliers)

'''
<script.py> output:
                 fertility    life  population  child_mortality     gdp  \
    Country                                                               
    Guatemala        3.974  71.100  14388929.0             34.5  6849.0   
    Haiti            3.350  45.000   9993247.0            208.8  1518.0   
    Tajikistan       3.780  66.830   6878637.0             52.6  2110.0   
    Timor-Leste      6.237  65.952   1124355.0             63.8  1777.0   
    
                                region  
    Country                             
    Guatemala                  America  
    Haiti                      America  
    Tajikistan   Europe & Central Asia  
    Timor-Leste    East Asia & Pacific
'''


In [None]:
# Many statistical and machine learning packages cannot determine the best action
# to take when missing data entries are encountered. Dealing with missing data is
# natural in pandas (both in using the default behavior and in defining a custom behavior).
# In Chapter 1, you practiced using the .dropna() method to drop missing values. Now,
# you will practice imputing missing values. You can use .groupby() and .transform()
# to fill missing data appropriately for each group.

# One group per ('sex', 'pclass') combination: by_sex_class
by_sex_class = titanic.groupby(by=['sex', 'pclass'])

# Median imputation helper, intended for use with .transform(): impute_median
def impute_median(series):
    """Return `series` with missing values replaced by the median of `series`."""
    group_median = series.median()
    return series.fillna(group_median)

# Fill missing ages with the median age of the passenger's (sex, pclass) group
# and write the result back to titanic['age']
titanic['age'] = by_sex_class['age'].transform(impute_median)

# Show the last ten rows of titanic
print(titanic.tail(10))


'''
<script.py> output:
          pclass  survived                                     name     sex   age  \
    1299       3         0                      Yasbeck, Mr. Antoni    male  27.0   
    1300       3         1  Yasbeck, Mrs. Antoni (Selini Alexander)  female  15.0   
    1301       3         0                     Youseff, Mr. Gerious    male  45.5   
    1302       3         0                        Yousif, Mr. Wazli    male  25.0   
    1303       3         0                    Yousseff, Mr. Gerious    male  25.0   
    1304       3         0                     Zabour, Miss. Hileni  female  14.5   
    1305       3         0                    Zabour, Miss. Thamine  female  22.0   
    1306       3         0                Zakarian, Mr. Mapriededer    male  26.5   
    1307       3         0                      Zakarian, Mr. Ortin    male  27.0   
    1308       3         0                       Zimmerman, Mr. Leo    male  29.0   
    
          sibsp  parch  ticket     fare cabin embarked boat   body home.dest  
    1299      1      0    2659  14.4542   NaN        C    C    NaN       NaN  
    1300      1      0    2659  14.4542   NaN        C  NaN    NaN       NaN  
    1301      0      0    2628   7.2250   NaN        C  NaN  312.0       NaN  
    1302      0      0    2647   7.2250   NaN        C  NaN    NaN       NaN  
    1303      0      0    2627  14.4583   NaN        C  NaN    NaN       NaN  
    1304      1      0    2665  14.4542   NaN        C  NaN  328.0       NaN  
    1305      1      0    2665  14.4542   NaN        C  NaN    NaN       NaN  
    1306      0      0    2656   7.2250   NaN        C  NaN  304.0       NaN  
    1307      0      0    2670   7.2250   NaN        C  NaN    NaN       NaN  
    1308      0      0  315082   7.8750   NaN        S  NaN    NaN       NaN
'''

In [None]:
# The .apply() method when used on a groupby object performs an arbitrary function on 
# each of the groups. These functions can be aggregations, transformations or more complex
# workflows. The .apply() method will then combine the results in an intelligent way.

def disparity(gr):
    """Summarize the 'gdp' column of one group.

    Returns a DataFrame on gr's index with two columns:
    'z(gdp)', each row's gdp expressed as (value - mean) / std within gr, and
    'regional spread(gdp)', the max-minus-min gdp of gr repeated on every row.
    """
    gdp = gr['gdp']
    # Spread of gdp across the group (a single scalar): s
    s = gdp.max() - gdp.min()
    # Z-score of each gdp value relative to the group: z
    z = (gdp - gdp.mean())/gdp.std()
    # The scalar s is broadcast along z's index by the DataFrame constructor
    return pd.DataFrame({'z(gdp)':z , 'regional spread(gdp)':s})



In [None]:
# One group per region: regional
regional = gapminder_2010.groupby(by='region')

# Run disparity on every regional group and combine the pieces: reg_disp
reg_disp = regional.apply(disparity)

# Show the disparity rows for 'United States', 'United Kingdom' and 'China'
print(reg_disp.loc[['United States', 'United Kingdom', 'China']])


'''
<script.py> output:
                    regional spread(gdp)    z(gdp)
    Country                                       
    United States                47855.0  3.013374
    United Kingdom               89037.0  0.572873
    China                        96993.0 -0.432756
'''

In [None]:
# By using .apply(), you can write functions that filter rows within groups.
# The .apply() method will handle the iteration over individual groups and then
# re-combine them back into a Series or DataFrame.

# In this exercise you'll take the Titanic data set and analyze survival rates
# from the 'C' deck, which contained the most passengers. To do this you'll group
# the dataset by 'sex' and then use the .apply() method on a provided user defined
# function which calculates the mean survival rates on the 'C' deck:

def c_deck_survival(gr):
    """Return the mean of 'survived' over rows whose 'cabin' starts with 'C'.

    Parameters
    ----------
    gr : DataFrame
        Rows of one group; must provide the 'cabin' and 'survived' columns.

    Returns
    -------
    float
        Mean of 'survived' for the selected rows; NaN when no row qualifies.
    """
    # na=False makes missing cabins count as "not on C deck" and yields a clean
    # boolean mask directly, instead of patching NaN afterwards with fillna().
    c_passengers = gr['cabin'].str.startswith('C', na=False)

    return gr.loc[c_passengers, 'survived'].mean()

# Split the titanic rows into one group per value of 'sex': by_sex
by_sex = titanic.groupby(by='sex')

# Evaluate c_deck_survival once per group and collect the results: c_surv_by_sex
c_surv_by_sex = by_sex.apply(c_deck_survival)

# Show the per-sex result
print(c_surv_by_sex)


'''
<script.py> output:
    sex
    female    0.913043
    male      0.312500
    dtype: float64
'''

In [None]:
# You can use groupby with the .filter() method to remove whole groups of rows
# from a DataFrame based on a boolean condition.

# In this exercise, you'll take the February sales data and remove entries from
# companies that purchased 35 Units or fewer in the whole month.

# First, you'll identify how many units each company bought for verification. Next
# you'll use the .filter() method after grouping by 'Company' to remove all rows
# belonging to companies whose sum over the 'Units' column was not greater than 35.
# Finally, verify that the three companies whose total Units purchased fell below
# that threshold have been filtered out from the DataFrame.

# Load 'sales.csv' with the 'Date' column as the index; parse_dates=True asks
# pandas to parse that index as dates: sales
sales = pd.read_csv('sales.csv', parse_dates=True, index_col='Date')

# One group per distinct 'Company': by_company
by_company = sales.groupby(by='Company')

# Total 'Units' per company, printed so the filter can be checked: by_com_sum
by_com_sum = by_company['Units'].sum()
print(by_com_sum)

# Keep only the rows of companies whose total 'Units' is > 35: by_com_filt
by_com_filt = by_company.filter(lambda grp: grp['Units'].sum() > 35)
print(by_com_filt)



'''
<script.py> output:
    Company
    Acme Coporation    34
    Hooli              30
    Initech            30
    Mediacore          45
    Streeplex          36
    Name: Units, dtype: int64
                           Company   Product  Units
    Date                                           
    2015-02-02 21:00:00  Mediacore  Hardware      9
    2015-02-04 15:30:00  Streeplex  Software     13
    2015-02-09 09:00:00  Streeplex   Service     19
    2015-02-09 13:00:00  Mediacore  Software      7
    2015-02-19 11:00:00  Mediacore  Hardware     16
    2015-02-19 16:00:00  Mediacore   Service     10
    2015-02-21 05:00:00  Mediacore  Software      3
    2015-02-26 09:00:00  Streeplex   Service      4
'''

In [None]:
# You have seen how to group by a column, or by multiple columns. Sometimes,
# you may instead want to group by a function/transformation of a column. The
# key here is that the Series is indexed the same way as the DataFrame. You can
# also mix and match column grouping with Series grouping.

# In this exercise your job is to investigate survival rates of passengers on the
# Titanic by 'age' and 'pclass'. In particular, the goal is to find out what fraction
# of children under 10 survived in each 'pclass'. You'll do this by first creating a
# boolean array where True is passengers under 10 years old and False is everyone else
# (age 10 or older, or age missing). You'll use .map() to change these values to strings.

# Finally, you'll group by the under 10 series and the 'pclass' column and aggregate the
# 'survived' column. The 'survived' column has the value 1 if the passenger survived and
# 0 otherwise. The mean of the 'survived' column is the fraction of passengers who lived.

# Label every passenger by age bracket: under10
# (a missing age compares False against 10, so it is labelled 'over 10')
under10 = titanic['age'].lt(10).map({
    True: 'under 10',
    False: 'over 10',
})

# Survival rate per age bracket
survived_mean_1 = titanic.groupby(by=under10)['survived'].mean()
print(survived_mean_1)

# Survival rate per (age bracket, 'pclass') combination
survived_mean_2 = titanic.groupby(by=[under10, 'pclass'])['survived'].mean()
print(survived_mean_2)



'''
<script.py> output:
    age
    over 10     0.366748
    under 10    0.609756
    Name: survived, dtype: float64
    age       pclass
    over 10   1         0.617555
              2         0.380392
              3         0.238897
    under 10  1         0.750000
              2         1.000000
              3         0.446429
    Name: survived, dtype: float64
'''

In [None]:
# Notice that .value_counts() sorts by values by default. The result is returned as
# a Series of counts indexed by unique entries from the original Series with values
# (counts) ranked in descending order.

# Pull the 'NOC' column out of medals: country_names
country_names = medals.loc[:, 'NOC']

# Tally how often each NOC appears, largest count first: medal_counts
medal_counts = country_names.value_counts(sort=True, ascending=False)

# Show the 15 NOCs with the most rows
print(medal_counts.head(n=15))

'''
<script.py> output:
    USA    4335
    URS    2049
    GBR    1594
    FRA    1314
    ITA    1228
    GER    1211
    AUS    1075
    HUN    1053
    SWE    1021
    GDR     825
    NED     782
    JPN     704
    CHN     679
    RUS     638
    ROU     624
    Name: NOC, dtype: int64
'''

In [None]:
# Rather than ranking countries by total medals won and showing that list, you may
# want to see a bit more detail. You can use a pivot table to compute how many separate
# bronze, silver and gold medals each country won. That pivot table can then be used to
# repeat the previous computation to rank by total medals won.

# Count 'Athlete' entries for every ('NOC', 'Medal') pair, with one row per NOC
# and one column per medal type: counted
counted = medals.pivot_table(
    values='Athlete',
    index='NOC',
    columns='Medal',
    aggfunc='count',
)

# Row-wise sum across the medal columns: counted['totals']
counted['totals'] = counted.sum(axis=1)

# Reorder the rows so the largest 'totals' come first
counted = counted.sort_values(by='totals', ascending=False)

# Show the 15 rows with the highest totals
print(counted.head(n=15))


'''
<script.py> output:
    Medal  Bronze    Gold  Silver  totals
    NOC                                  
    USA    1052.0  2088.0  1195.0  4335.0
    URS     584.0   838.0   627.0  2049.0
    GBR     505.0   498.0   591.0  1594.0
    FRA     475.0   378.0   461.0  1314.0
    ITA     374.0   460.0   394.0  1228.0
    GER     454.0   407.0   350.0  1211.0
    AUS     413.0   293.0   369.0  1075.0
    HUN     345.0   400.0   308.0  1053.0
    SWE     325.0   347.0   349.0  1021.0
    GDR     225.0   329.0   271.0   825.0
    NED     320.0   212.0   250.0   782.0
    JPN     270.0   206.0   228.0   704.0
    CHN     193.0   234.0   252.0   679.0
    RUS     240.0   192.0   206.0   638.0
    ROU     282.0   155.0   187.0   624.0
'''

In [None]:
# Keep just the two gender-related columns: ev_gen
ev_gen = medals.loc[:, ['Event_gender', 'Gender']]

# Retain the first occurrence of each distinct pair: ev_gen_uniques
ev_gen_uniques = ev_gen.drop_duplicates(keep='first')

# Show the distinct pairs
print(ev_gen_uniques)

'''
<script.py> output:
          Event_gender Gender
    0                M    Men
    348              X    Men
    416              W  Women
    639              X  Women
    23675            W    Men
'''

In [None]:
# One group per ('Event_gender', 'Gender') combination: medals_by_gender
medals_by_gender = medals.groupby(by=['Event_gender', 'Gender'])

# Non-null counts of every remaining column within each group: medal_count_by_gender
medal_count_by_gender = medals_by_gender.count()

# Show the per-group counts
print(medal_count_by_gender)

'''
                         Medal  
    Event_gender Gender         
    M            Men     20067  
    W            Men         1  
                 Women    7277  
    X            Men      1653  
                 Women     218
'''

In [None]:
# Boolean mask for rows tagged as a women's event but with 'Gender' == 'Men': sus
sus = medals['Event_gender'].eq('W') & medals['Gender'].eq('Men')

# Rows selected by that mask: suspect
suspect = medals[sus]

# Show the suspicious row(s)
print(suspect)


'''
<script.py> output:
             City  Edition      Sport Discipline            Athlete  NOC Gender  \
    23675  Sydney     2000  Athletics  Athletics  CHEPCHUMBA, Joyce  KEN    Men   
    
              Event Event_gender   Medal  
    23675  marathon            W  Bronze
'''

In [None]:
# You may want to know which countries won medals in the most distinct sports.
# The .nunique() method is the principal aggregation here. Given a categorical Series S,
# S.nunique() returns the number of distinct categories.

# One group per 'NOC': country_grouped
country_grouped = medals.groupby(by='NOC')

# Number of distinct 'Sport' values per NOC, ordered from most to fewest: Nsports
Nsports = (
    country_grouped['Sport']
    .nunique()
    .sort_values(ascending=False)
)

# Show the 15 NOCs with the most distinct sports
print(Nsports.head(n=15))

'''
<script.py> output:
    NOC
    USA    34
    GBR    31
    FRA    28
    GER    26
    CHN    24
    AUS    22
    ESP    22
    CAN    22
    SWE    21
    URS    21
    ITA    21
    NED    20
    RUS    20
    JPN    20
    DEN    19
    Name: Sport, dtype: int64
    
'''

In [None]:
# Boolean mask: 'Edition' from 1952 through 1988, both ends included: during_cold_war
during_cold_war = medals['Edition'].between(1952, 1988)

# Boolean mask: 'NOC' is one of 'USA' / 'URS': is_usa_urs
is_usa_urs = medals['NOC'].isin(['USA', 'URS'])

# Rows satisfying both masks: cold_war_medals
cold_war_medals = medals[during_cold_war & is_usa_urs]

# One group per 'NOC' within the filtered rows
country_grouped = cold_war_medals.groupby(by='NOC')

# Distinct 'Sport' count per NOC, largest first: Nsports
Nsports = country_grouped['Sport'].nunique()
Nsports = Nsports.sort_values(ascending=False)

# Show the result
print(Nsports)


'''
<script.py> output:
    NOC
    URS    21
    USA    20
    Name: Sport, dtype: int64
'''

In [None]:
# Count 'Athlete' entries per ('Edition', 'NOC') pair, with editions as rows
# and NOCs as columns: medals_won_by_country
medals_won_by_country = medals.pivot_table(
    values='Athlete',
    index='Edition',
    columns='NOC',
    aggfunc='count',
)

# Restrict to index labels 1952 through 1988 and the 'USA' / 'URS' columns:
# cold_war_usa_usr_medals
cold_war_usa_usr_medals = medals_won_by_country.loc[1952:1988, ['USA', 'URS']]

# For each edition, the column label holding the larger count: most_medals
most_medals = cold_war_usa_usr_medals.idxmax(axis=1)

# Show how many editions each NOC came out ahead in
print(most_medals.value_counts())

'''
<script.py> output:
    URS    8
    USA    2
    dtype: int64
'''

In [None]:
# Rows of medals whose 'NOC' is 'USA': usa
usa = medals[medals['NOC'] == 'USA']

# Count 'Athlete' entries per ('Edition', 'Medal') pair, then pivot the 'Medal'
# level into the columns so each medal type gets its own column
usa_medals_by_year = (
    usa.groupby(['Edition', 'Medal'])['Athlete']
    .count()
    .unstack(level='Medal')
)

# Draw one line per medal column and display the figure
usa_medals_by_year.plot()
plt.show()
