## Brief Data Science Intro

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns

### Step 1:  Load Data Into Pandas

In [None]:
# here we'll load a dataframe from a csv file hosted on GitHub
df = pd.read_csv('https://raw.githubusercontent.com/JonathanBechtel/data/master/boston.csv')
# discard the first column by position (presumably a saved row index -- TODO confirm);
# reassigning instead of using inplace=True keeps the step an explicit, chainable expression
df = df.drop(columns=df.columns[0])
# head(10) previews the first 10 rows of the dataframe (a bare head() would show 5)
df.head(10)

In [None]:
# pull descriptive statistics on each of our columns
# (describe() summarises numeric columns by default: count, mean, std, min, quartiles, max)
df.describe()

In [None]:
# tally the missing (NaN) entries in each column; a column of zeros means nothing is missing
df.isna().sum()

### Exploratory Data Analysis

In [None]:
# we can also use matplotlib and seaborn to quickly create different visuals:
# pairplot draws a grid of pairwise plots across the columns of df, one panel per column pair;
# the trailing semicolon suppresses the text repr of the returned object in the cell output
sns.pairplot(df);

In [None]:
# distribution of the target column: histogram with a kernel density estimate overlaid.
# histplot is the supported replacement for distplot (deprecated since seaborn 0.11);
# stat="density" keeps the y-axis as a density, matching what distplot drew
sns.histplot(df['PRICE'], kde=True, stat="density");

### Model Fitting With Scikit Learn

In [None]:
# next, we'll fit a linear model with scikit learn:
# X holds the six feature columns we chose, y holds the target column
X = df.loc[:, ['CRIM', 'RM', 'DIS', 'TAX', 'PTRATIO', 'LSTAT']]
y = df.loc[:, 'PRICE']
# fit() trains the model on the data and returns the estimator itself, so we can build and train in one line
model = LinearRegression(n_jobs=-1).fit(X, y)
model  # show the fitted estimator as the cell output

In [None]:
# ask the fitted model for its predictions on the same rows it was trained on...
predicted_prices = model.predict(X)
# ...and store them as a new column next to the data they belong to
df['PREDICTION'] = predicted_prices
df.head()

In [None]:
# the Error column is the residual for each row: actual Price minus the model's Prediction
df['Error'] = df['PRICE'].sub(df['PREDICTION'])
df.head()

In [None]:
# now let's put this in a more readable form: one row per feature with its fitted coefficient,
# sorted from the largest weight to the smallest.
# The labels come from X.columns -- the exact features (in order) the model was fit on -- so each
# coefficient in model.coef_ lines up with its own variable. Slicing df.columns[0:6] instead would
# take the first six columns of the dataframe, which is not guaranteed to be the same list.
variable_strength = pd.DataFrame({'Variable': X.columns,
                                  'Weight': model.coef_
                                 }).sort_values(by='Weight', ascending=False)

variable_strength

### How Did We Do?

In [None]:
# scatter the actual prices against the model's predictions, with a fitted regression line.
# x and y are passed by keyword: in current seaborn the first positional argument is `data`
# and x/y are keyword-only, so the positional form fails
sns.regplot(x=df['PRICE'], y=df['PREDICTION']);