In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import seaborn as sns
from matplotlib import rcParams

%matplotlib inline 
%pylab inline 

In [None]:
# Read the house-sales CSV into a DataFrame and preview the first rows.
housing_csv_path = '../input/housesalesprediction/kc_house_data.csv'
df = pd.read_csv(housing_csv_path)
df.head()

Checking to see if any of our data has null values. If there were any, we'd drop or filter the null values out.

In [None]:
# Per-column flag: True when the column contains at least one missing value.
df.isna().any()

Checking the data type of each of our variables.

In [None]:
# Show the dtype pandas assigned to every column of the housing DataFrame.
df.dtypes

# **Exploratory analysis and regression results**

In [None]:
# Summary statistics for the columns of the housing DataFrame.
df.describe()

Let's plot two histograms to observe the distributions of house square footage and housing prices using **matplotlib (plt)**.

In [None]:
# Two panels side by side: square-footage histogram on the left,
# price histogram on the right.
fig, (ax_area, ax_price) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

ax_area.hist(df['sqft_living'], bins=80)
ax_area.set(xlabel='Ft^2', title="Histogram of House Square Footage")

ax_price.hist(df['price'], bins=80)
ax_price.set(xlabel='Price ($)', title="Histogram of Housing Prices")

plt.show()

# Linear Regression

In [None]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

OLS stands for Ordinary Least Squares. 
To fit a linear regression with OLS and print its summary, use the following pattern (one independent variable gives a simple regression; several, joined with `+`, give a multiple regression):


```
Reg = ols('Dependent variable ~ independent variable(s)', dataframe).fit()

print(Reg.summary())
```



In [None]:
# Simple linear regression: price explained by living-area square footage.
simple_formula = 'price ~ sqft_living'
m = ols(formula=simple_formula, data=df).fit()
print(m.summary())

# Multivariate linear regression

In [None]:
# Multiple regression: price explained by several predictors at once.
# The join produces exactly 'price ~ sqft_living + bedrooms + grade + condition'.
predictors = ['sqft_living', 'bedrooms', 'grade', 'condition']
multi_formula = 'price ~ ' + ' + '.join(predictors)
m = ols(formula=multi_formula, data=df).fit()
print(m.summary())

In [None]:
# Joint plot of price against living area with a fitted regression line.
# `height` sets the figure size; it replaces the old `size` keyword, which
# seaborn renamed in 0.9 and later removed, so `size=7` fails on current versions.
sns.jointplot(x="sqft_living", y="price", data=df, kind='reg', fit_reg=True, height=7)
plt.show()

# Clustering Model in Python

**Step One: Exploratory Data Analysis**

In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

import sklearn
from sklearn import cluster

%matplotlib inline

# Read the faithful CSV into a DataFrame and preview the first rows.
faithful_csv_path = '../input/faithful/faithful.csv'
faithful = pd.read_csv(faithful_csv_path)
faithful.head()

In [None]:
# Give the columns short, explicit names.
# NOTE(review): this assumes the CSV has exactly two columns — confirm against the file.
faithful.columns = ['eruptions', 'waiting']

# Scatter of eruption length against waiting time.
plt.scatter(faithful.eruptions, faithful.waiting)
plt.title('Old Faithful Data Scatterplot')
plt.xlabel('Length of eruption (minutes)')
plt.ylabel('Time between eruptions (minutes)')
# Render the figure explicitly, like the other plotting cells, so the cell
# does not end by echoing the Text(...) repr of the last label call.
plt.show()

**Step Two: Building the Cluster Model**

In [None]:
# Convert the DataFrame to a plain NumPy array for scikit-learn.
faith = faithful.to_numpy()

# Number of clusters to fit.
k = 2
# random_state fixes the centroid initialisation so that cluster numbering and
# centroids are identical on every Restart & Run All.
# n_init is set explicitly because the library default changed across
# scikit-learn versions (10 historically, 'auto' in newer releases).
kmeans = cluster.KMeans(n_clusters=k, n_init=10, random_state=42)
kmeans.fit(faith)

# Cluster index assigned to each observation, and the fitted cluster centres.
labels = kmeans.labels_
centroids = kmeans.cluster_centers_

In [None]:
# Draw each cluster's observations, then overlay its centroid as a large black "x".
for cluster_id in range(k):
    # Rows of the data whose assigned label equals this cluster.
    members = faith[labels == cluster_id]
    plt.plot(members[:, 0], members[:, 1], 'o', markersize=7)
    # Centroid marker, enlarged and thickened so it stands out from the points.
    plt.plot(centroids[cluster_id, 0], centroids[cluster_id, 1], 'kx',
             markersize=15.0, markeredgewidth=4.0)
plt.show()