### Getting Started

In [1]:
import pandas as pd # Main library we will be working with today
import numpy as np # Helper library

# Show floats rounded to whole numbers with thousands separators (e.g. 1,234,567)
pd.set_option('display.float_format', '{:,.0f}'.format)

# Displays the charts right in the notebook (IPython line magic; only valid inside an IPython/Jupyter session)
%matplotlib inline

In [6]:
# Creating a dataframe manually

# pd.DataFrame([1,2,3])
# pd.DataFrame([[1,2,3], [4,5,6]])
# pd.DataFrame([[1,2,3], [4,5,6]], columns=['col 1', 'col 2', 'col 3'])
# pd.DataFrame([[1,2,3], [4,5,6]], columns=['col 1', 'col 2', 'col 3'], index=[60, 100])

# Build the dataframe in one step by reading a CSV file.
# Data source: https://www.kaggle.com/harlfoxem/housesalesprediction
# The path is relative, so the file must sit in the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="kc_house_data.csv")

### Exploring Your Data

In [12]:
# df.head(5)
# df.columns
# df.info()
# df.describe()
# df[['id', 'price', 'bedrooms']].head(10)

### Creating a new column

In [17]:
# Create new column with single value
# df['greg'] = 1

# Create new column based off of another one
# df['tester'] = df['price']*2

# View a subsection of columns only
# df[['greg', 'tester', 'price']].head()

# Drop columns. Axis=1 means drop columns, not rows
# df.drop(['greg', 'tester'], axis=1, inplace=True)

# df.head()

### Filtering Your Data

#### Method 1 - Beware, can get sloppy

In [25]:
# Data frames can be filtered via True/False arrays
# pd.DataFrame(['Rick', 'Bob', 'Katie'])
# pd.DataFrame(['Rick', 'Bob', 'Katie'])[[True, True, True]]
# pd.DataFrame(['Rick', 'Bob', 'Katie'])[[True, False, True]]

# df['price']
# df['price'] > 600000
# df[df['price'] > 600000]
# df[(df['price'] > 662500) & (df['bedrooms'] >= 4)]

#### Method 2 - Longer, but cleaner

In [27]:
# Method 2 - Longer, but cleaner -- Put your conditions in variables
# c1 = df['price'] > 662500
# c2 = df['bedrooms'] >= 3

# df[c1 & c2].head(5)

### Group by columns

In [31]:
# df['bedrooms'].unique()
# df.groupby(['bedrooms'])['price'].mean()
# df.groupby(['bedrooms', 'waterfront'])['price'].mean()

### Pivot Tables

In [33]:
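# Note: pandas 2.1+ emits a FutureWarning when np.mean is passed as aggfunc; aggfunc='mean' is the equivalent string form
# pd.pivot_table(df, values='price', index=['bedrooms'], columns=['waterfront'], aggfunc=np.mean)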

### Plotting data

In [38]:
# Pandas has some built-in plot functionality
# df[['price', 'bedrooms']].plot.scatter(y='price', x='bedrooms')

# Filtering to a data range
# df_bed = df[df['bedrooms'] < 15]
# df_bed[['price', 'bedrooms']].plot.scatter(y='price', x='bedrooms')
# df_bed[['price', 'bedrooms']].plot.scatter(y='price', x='bedrooms', alpha=.1)

# Using a scatter plot to chart lat and long
# df.plot.scatter(x='long', y='lat', s=1, figsize=(15,15), alpha=.2)

### Bucketing Your Data

In [48]:
# df['price']

# Pandas making 10 equal-width buckets for you
# df['price_bin'] = pd.cut(df['price'], bins=10)

# Manually telling Pandas which buckets to create
# df['price_bin'] = pd.cut(df['price'], bins=[0, 100000, 300000, 500000, 700000, 8000000])

# Adding in labels for readability
# df['price_bin'] = pd.cut(df['price'], bins=[0, 100000, 300000, 500000, 700000, 8000000],\
#                          labels=['mini', 'small', 'med', 'large', 'XL'])

# df[['price', 'price_bin']].head(20)

### Iterating over every row in a column

In [1]:
# df[['id', 'yr_built']].head(10)

# Using map to iterate over a series with lambda
# df['yr_built'].map(lambda x: ("This house was built in %s" % (x))).head(10)

# Creating a function that we will iterate over our Series with
# def over_under(year_built):
#     if year_built >= 1960:
#         return "After or on 1960 on %s" % (year_built)
#     else:
#         return "Before 1960 on %s" % (year_built)

# df['yr_built'].map(lambda x: over_under(x)).head(10)

### Where to learn more

In [53]:
# Common Excel Tasks Translated To Pandas
# https://pbpython.com/excel-pandas-comp.html