In [None]:
# Import pandas
import pandas as pd

# Load the election data from filename, using the 'county' column as the index: election
election = pd.read_csv(filename, index_col='county')

# Keep only the columns named in result_columns: results
result_columns = ['winner', 'total', 'voters']
results = election[result_columns]

# Show the first rows of results
print(results.head())


In [None]:
# Label-based .loc slices include BOTH endpoints, so 'Obama' appears in
# left_columns and middle_columns, and 'Romney' in middle_columns and right_columns.

# Slice the columns from the starting column to 'Obama': left_columns
# (open-ended start, so the slice does not depend on which column happens to be first)
left_columns = election.loc[:, :'Obama']

# Print the output of left_columns.head()
print(left_columns.head())

# Slice the columns from 'Obama' to 'winner': middle_columns
middle_columns = election.loc[:, 'Obama':'winner']

# Print the output of middle_columns.head()
print(middle_columns.head())

# Slice the columns from 'Romney' to the end: right_columns
right_columns = election.loc[:, 'Romney':]

# Print the output of right_columns.head()
print(right_columns.head())


In [None]:
# Row labels and column labels to extract: rows, cols
rows = ['Philadelphia', 'Centre', 'Fulton']
cols = ['winner', 'Obama', 'Romney']

# Select exactly those rows and columns by label: three_counties
three_counties = election.loc[rows, cols]

# Show the resulting 3x3 selection
print(three_counties)


In [None]:
# Boolean mask that is True where turnout exceeds 70: high_turnout
high_turnout = election['turnout'] > 70

# Keep only the rows selected by the mask: high_turnout_df
high_turnout_df = election.loc[high_turnout]

# Print the high_turnout_df DataFrame
print(high_turnout_df)


In [None]:
# Import numpy
import numpy as np

# Boolean mask that is True where the margin is below 1: too_close
too_close = election['margin'] < 1

# Assign np.nan to the 'winner' column where the results were too close to call.
# A single .loc[rows, column] assignment writes into `election` itself; the
# chained form `election.winner[too_close] = ...` may assign to a temporary
# copy and leave `election` unchanged.
election.loc[too_close, 'winner'] = np.nan

# .info() prints its summary itself and returns None, so it is called directly
# instead of being wrapped in print() (which would emit an extra "None" line)
election.info()

In [None]:
# In certain scenarios, it may be necessary to remove rows and columns with
# missing data from a DataFrame. The .dropna() method is used to perform this action. 

# Select the 'age' and 'cabin' columns: df
df = titanic.loc[:, ['age', 'cabin']]

# Print the shape of df
print(df.shape)

# Shape after dropping rows in which ANY of the two values is missing
print(df.dropna(how='any').shape)

# Shape after dropping rows in which ALL of the two values are missing
print(df.dropna(how='all').shape)

# Drop the columns of titanic that have fewer than 1000 non-missing values and
# summarize what remains. .info() prints the summary itself and returns None,
# so it is called directly instead of being wrapped in print() (which would
# emit an extra "None" line).
titanic.dropna(thresh=1000, axis='columns').info()



In [3]:
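# The .apply() method can be used on a pandas DataFrame to apply an arbitrary Python function along an axis:
# by default the function receives each column as a Series. Because to_celsius uses only arithmetic, every element is converted.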

# Write a function to convert degrees Fahrenheit to degrees Celsius: to_celsius
def to_celsius(F):
    """Convert a temperature from degrees Fahrenheit to degrees Celsius.

    Args:
        F: Temperature in degrees Fahrenheit. Anything that supports
            subtraction of and multiplication by a number works, e.g. a
            scalar or the column handed over by DataFrame.apply.

    Returns:
        (F - 32) scaled by 5/9, i.e. the temperature in degrees Celsius.
    """
    fahrenheit_offset = F - 32
    return (5.0 / 9.0) * fahrenheit_offset

# Convert the two Fahrenheit columns with to_celsius: df_celsius
fahrenheit_cols = ['Mean TemperatureF', 'Mean Dew PointF']
df_celsius = weather[fahrenheit_cols].apply(to_celsius)

# Rename the columns so the labels reflect the new unit
df_celsius.columns = ['Mean TemperatureC', 'Mean Dew PointC']

# Show the first rows of the converted data
print(df_celsius.head())


2

In [None]:
# The .map() method is used to transform values according to a Python dictionary look-up.
# Lookup table from winner name to color: red_vs_blue
red_vs_blue = dict(Obama='blue', Romney='red')

# Translate each 'winner' value through the lookup table into a new 'color' column
election['color'] = election['winner'].map(red_vs_blue)

# Show the first rows, now including 'color'
print(election.head())


In [None]:
# When performance is paramount, you should avoid using .apply() and .map()
# because those constructs perform Python for-loops over the data stored in 
# a pandas Series or DataFrame. By using vectorized functions instead, you
# can loop over the data at the same speed as compiled code (C, Fortran, etc.)!
# NumPy, SciPy and pandas come with a variety of vectorized functions
# (called Universal Functions or UFuncs in NumPy).

# You can even write your own vectorized functions, but for now we will focus
# on the ones distributed by NumPy and pandas.

# In this exercise you're going to import the zscore method from scipy.stats and
# use it to compute the deviation in voter turnout in Pennsylvania from the mean
# in fractions of the standard deviation. In statistics, the z-score is the number
# of standard deviations by which an observation is above the mean - so if it is
# negative, it means the observation is below the mean.

# Instead of using .apply() as you did in the earlier exercises, the zscore UFunc
# will take a pandas Series as input and return a NumPy array. You will then assign
# the values of the NumPy array to a new column in the DataFrame. 


# Import zscore from scipy.stats
from scipy.stats import zscore

# Standardize the 'turnout' column with scipy's zscore: turnout_zscore
turnout_zscore = zscore(election.turnout)

# Show what kind of object zscore returned
print(type(turnout_zscore))

# Store the standardized values as a new column of election
election['turnout_zscore'] = turnout_zscore

# Show the first rows, now including 'turnout_zscore'
print(election.head())


In [None]:
# Indexes are immutable objects. This means that if you want to change 
# or modify the index in a DataFrame, then you need to change the whole index. 

# A list comprehension is a succinct way to generate a list in one line.
# For example, the following list comprehension generates a list that contains
# the cubes of all numbers from 0 to 9: 
# Comprehension form (the stray trailing '.' that made this line a
# SyntaxError has been removed):
cubes = [i**3 for i in range(10)]

# This is equivalent to the following explicit loop, which rebuilds the same list:
cubes = []
for i in range(10):
    cubes.append(i**3)
    
# Build the upper-cased version of every existing index label: new_idx
new_idx = [label.upper() for label in sales.index]

# Replace the whole index of sales with the new labels
sales.index = new_idx

# Show sales with its upper-cased index
print(sales)


In [4]:
# Name the row index 'MONTHS' and show the result
row_labels = sales.index
row_labels.name = 'MONTHS'
print(sales)

# Name the column index 'PRODUCTS' and show the result again
column_labels = sales.columns
column_labels.name = 'PRODUCTS'
print(sales)


[0, 1, 8, 27, 64, 125, 216, 343, 512, 729]

In [None]:
# You can also build the DataFrame and index independently, and then put them
# together. If you take this route, be careful, as any mistakes in generating
# the DataFrame or the index can cause the data and the index to be aligned incorrectly.

# The six month labels, in order: months
months = 'Jan Feb Mar Apr May Jun'.split()

# Use them as the new index of sales (one label per existing row)
sales.index = months

# Show sales with the month index
print(sales)


In [None]:
# Select the 'CA' and 'TX' entries explicitly by label with .loc
wanted_states = ['CA', 'TX']
print(sales.loc[wanted_states])

# Slice from 'CA' to 'TX' with plain [] slicing
print(sales['CA':'TX'])


In [None]:
# With a MultiIndex, you should always ensure the index is sorted. You can
# skip this only if you know the data is already sorted on the index fields.

# Build a ('state', 'month') MultiIndex and sort it in one chained step: sales
sales = sales.set_index(['state', 'month']).sort_index()

# Show the re-indexed, sorted DataFrame
print(sales)

In [None]:
# Make the 'state' column the index: sales
sales = sales.set_index('state')

# Show the re-indexed DataFrame
print(sales)

# Pull out everything labelled 'NY' and show it
ny_sales = sales.loc['NY']
print(ny_sales)


In [None]:
# Looking up data based on inner levels of a MultiIndex can be a bit trickier.
# In this exercise, you will use your sales DataFrame to do some increasingly complex lookups.

# The trickiest of all these lookups are when you want to access some inner levels of the
# index. In this case, you need to use slice(None) in the slicing parameter for the outermost
# dimension(s) instead of the usual :, or use pd.IndexSlice. You can refer to the pandas
# documentation for more details. For example, in the video, Dhavide used the following
# code to extract rows from all Symbols for the dates Oct. 3rd through 4th inclusive:

# stocks.loc[(slice(None), slice('2016-10-03', '2016-10-04')), :]

# pd.IndexSlice returns exactly the key it is indexed with, so idx[a, b]
# is the tuple (a, b) and idx[:, b] is (slice(None), b)
idx = pd.IndexSlice

# NY in month 1: NY_month1
NY_month1 = sales.loc[('NY', 1), :]

# CA and TX in month 2: CA_TX_month2
CA_TX_month2 = sales.loc[idx[['CA', 'TX'], 2], :]

# Every state in month 2: all_month2
all_month2 = sales.loc[idx[:, 2], :]

In [None]:
# Reshape users so that weekdays become rows, cities become columns and
# the cells hold the 'visitors' values: visitors_pivot
visitors_pivot = users.pivot(
    index='weekday',
    columns='city',
    values='visitors',
)

# Show the pivoted DataFrame
print(visitors_pivot)

''' print(users)
  weekday    city  visitors  signups
0     Sun  Austin       139        7
1     Sun  Dallas       237       12
2     Mon  Austin       326        3
3     Mon  Dallas       456        5

<script.py> output:
    city     Austin  Dallas
    weekday                
    Mon         326     456
    Sun         139     237
'''



In [None]:
# If you do not select any particular variables, all of them will be pivoted. 
# In this case - with the users DataFrame - both 'visitors' and 'signups' will
# be pivoted, creating hierarchical column labels.

# Weekdays as rows, cities as columns, 'signups' as cell values: signups_pivot
signups_pivot = users.pivot(
    index='weekday',
    columns='city',
    values='signups',
)
print(signups_pivot)

# Same layout without selecting values, so every remaining column is pivoted: pivot
pivot = users.pivot(index='weekday', columns='city')
print(pivot)


'''
 users
Out[7]: 
  weekday    city  visitors  signups
0     Sun  Austin       139        7
1     Sun  Dallas       237       12
2     Mon  Austin       326        3
3     Mon  Dallas       456        5

<script.py> output:
    city     Austin  Dallas
    weekday                
    Mon           3       5
    Sun           7      12
            visitors        signups       
    city      Austin Dallas  Austin Dallas
    weekday                               
    Mon          326    456       3      5
    Sun          139    237       7     12

'''

In [None]:
# Move the 'weekday' index level into the columns: byweekday
byweekday = users.unstack('weekday')
print(byweekday)

# Move 'weekday' back from the columns into the index and show the result
restacked_by_weekday = byweekday.stack('weekday')
print(restacked_by_weekday)

'''
<script.py> output:
            visitors      signups    
    weekday      Mon  Sun     Mon Sun
    city                             
    Austin       326  139       3   7
    Dallas       456  237       5  12
                    visitors  signups
    city   weekday                   
    Austin Mon           326        3
           Sun           139        7
    Dallas Mon           456        5
           Sun           237       12
'''

In [None]:
# Move the 'city' index level into the columns: bycity
bycity = users.unstack('city')
print(bycity)

# Move 'city' back from the columns into the index and show the result
restacked_by_city = bycity.stack('city')
print(restacked_by_city)

'''
<script.py> output:
            visitors        signups       
    city      Austin Dallas  Austin Dallas
    weekday                               
    Mon          326    456       3      5
    Sun          139    237       7     12
                    visitors  signups
    weekday city                     
    Mon     Austin       326        3
            Dallas       456        5
    Sun     Austin       139        7
            Dallas       237       12
'''

In [None]:
# Continuing from the previous exercise, you will now use .swaplevel(0, 1) to flip
# the index levels. Note they won't be sorted. To sort them, you will have to follow
# up with a .sort_index(). You will then obtain the original DataFrame. Note that an
# unsorted index leads to slicing failures.

# Stack 'city' back into the index of bycity, then swap the two index levels: newusers
newusers = bycity.stack(level='city').swaplevel(0, 1)

# Show newusers before sorting, to inspect the order of the swapped index
print(newusers)

# Sort the swapped index: newusers
newusers = newusers.sort_index()

# Show newusers again, now with a sorted index
print(newusers)

# Compare the round-tripped DataFrame with the original users
is_same_as_users = newusers.equals(users)
print(is_same_as_users)



'''
<script.py> output:
                    visitors  signups
    city   weekday                   
    Austin Mon           326        3
    Dallas Mon           456        5
    Austin Sun           139        7
    Dallas Sun           237       12
                    visitors  signups
    city   weekday                   
    Austin Mon           326        3
           Sun           139        7
    Dallas Mon           456        5
           Sun           237       12
    True
'''

In [None]:
# The goal of melting is to restore a pivoted DataFrame to its original form,
# or to change it from a wide shape to a long shape. You can explicitly specify
# the columns that should remain in the reshaped DataFrame with id_vars, and
# list which columns to convert into values with value_vars. As Dhavide demonstrated,
# if you don't pass a name to the values in pd.melt(), you will lose the name of your
# variable. You can fix this by using the value_name keyword argument.

'''
In [1]: visitors_by_city_weekday
Out[1]: 
city     Austin  Dallas
weekday                
Mon         326     456
Sun         139     237
'''


# Turn the index back into an ordinary column: visitors_by_city_weekday
visitors_by_city_weekday = visitors_by_city_weekday.reset_index()
print(visitors_by_city_weekday)

# Melt to long form, keeping 'weekday' as the identifier column and
# naming the value column 'visitors': visitors
visitors = pd.melt(
    visitors_by_city_weekday,
    id_vars=['weekday'],
    value_name='visitors',
)
print(visitors)

'''
<script.py> output:
    city weekday  Austin  Dallas
    0        Mon     326     456
    1        Sun     139     237
      weekday    city  visitors
    0     Mon  Austin       326
    1     Sun  Austin       139
    2     Mon  Dallas       456
    3     Sun  Dallas       237
'''

In [None]:
'''
In [1]: users
Out[1]: 
  weekday    city  visitors  signups
0     Sun  Austin       139        7
1     Sun  Dallas       237       12
2     Mon  Austin       326        3
3     Mon  Dallas       456        5
'''

# You can move multiple columns into a single column (making the data long and skinny) by "melting" multiple columns. 

# Melt users, keeping 'weekday' and 'city' as identifier columns; every
# other column is folded into variable/value pairs: skinny
identifier_columns = ['weekday', 'city']
skinny = pd.melt(users, id_vars=identifier_columns)

# Show the long-form result
print(skinny)

'''
<script.py> output:
      weekday    city  variable  value
    0     Sun  Austin  visitors    139
    1     Sun  Dallas  visitors    237
    2     Mon  Austin  visitors    326
    3     Mon  Dallas  visitors    456
    4     Sun  Austin   signups      7
    5     Sun  Dallas   signups     12
    6     Mon  Austin   signups      3
    7     Mon  Dallas   signups      5

'''

In [None]:
# Sometimes, all you need is some key-value pairs, and the context does not matter.
# If said context is in the index, you can easily obtain what you want. For example,
# in the users DataFrame, the visitors and signups columns lend themselves well to being
# represented as key-value pairs. So if you created a hierarchical index with 'city' and
# 'weekday' columns as the index, you can easily extract key-value pairs for the 'visitors'
# and 'signups' columns by melting users and specifying col_level=0

# Move 'city' and 'weekday' into a hierarchical index: users_idx
index_columns = ['city', 'weekday']
users_idx = users.set_index(index_columns)
print(users_idx)

# Melt the remaining columns into variable/value pairs: kv_pairs
kv_pairs = pd.melt(users_idx, col_level=0)
print(kv_pairs)


'''
<script.py> output:
                    visitors  signups
    city   weekday                   
    Austin Sun           139        7
    Dallas Sun           237       12
    Austin Mon           326        3
    Dallas Mon           456        5
       variable  value
    0  visitors    139
    1  visitors    237
    2  visitors    326
    3  visitors    456
    4   signups      7
    5   signups     12
    6   signups      3
    7   signups      5
'''

In [None]:
# A pivot table allows you to see all of your variables as a function of two other variables.
# In this exercise, you will use the .pivot_table() method to see how the users DataFrame
# entries appear when presented as functions of the 'weekday' and 'city' columns. That is,
# with the rows indexed by 'weekday' and the columns indexed by 'city'.

'''
In [1]: users
Out[1]: 
  weekday    city  visitors  signups
0     Sun  Austin       139        7
1     Sun  Dallas       237       12
2     Mon  Austin       326        3
3     Mon  Dallas       456        5
'''

# Pivot table of users with weekdays as rows and cities as columns: by_city_day
by_city_day = pd.pivot_table(users, index='weekday', columns='city')

# Show the pivot table
print(by_city_day)


'''
<script.py> output:
            visitors        signups       
    city      Austin Dallas  Austin Dallas
    weekday                               
    Mon          326    456       3      5
    Sun          139    237       7     12
'''
