# Introduction
**This will be your workspace for the [Machine Learning course](https://www.kaggle.com/learn/machine-learning).**

You will need to translate the concepts to work with the data in this notebook, the Iowa data. Each page in the Machine Learning course includes instructions for what code to write at that step in the course.

# Write Your Code Below

In [None]:
import pandas as pd

main_file_path = '../input/house-prices-advanced-regression-techniques/train.csv' # path to the Iowa house-prices training data
data = pd.read_csv(main_file_path)

# Run this code block with the control-enter keys on your keyboard, or click the blue button on the left.
print(data.describe())  # print summary statistics of the house-prices data

In [None]:
openporch_data=data.OpenPorchSF # store the OpenPorchSF column as the variable openporch_data
print(openporch_data.head()) # head() returns the first few rows of the OpenPorchSF column
print(openporch_data) # print the column itself

In [None]:
print(data.columns) # print the names of all columns in the data

In [None]:
# Select several columns from the house-prices DataFrame by passing a list of column names.
columns_of_interest = ['Alley', 'LandContour', 'Fence']
columns_of_data = data[columns_of_interest]
print(columns_of_data)
columns_of_data.describe() # last expression in the cell, so the notebook displays this summary

In [None]:
from sklearn.tree import DecisionTreeRegressor # decision-tree model used to predict the sale price
# Predict the sale price from the 7 numeric variables without missing values.
y=data.SalePrice
predictors=['1stFlrSF','YearBuilt', 'LotArea', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
x=data[predictors]

model=DecisionTreeRegressor() # define model
model.fit(x,y) # fit model on all rows of the training data
print("Making predictions for the following 5 houses:")
print(x.head())
print("The predictions are")
print(model.predict(x.head())) # these 5 rows were part of the data the model was fitted on


In [None]:
# Calculate the mean absolute error on the same rows the model was fitted on (in-sample error).
from sklearn.metrics import mean_absolute_error

predicted_home_prices = model.predict(x)
mean_absolute_error(y, predicted_home_prices)

In [None]:
from sklearn.model_selection import train_test_split

# Split data into training and validation data, for both predictors and target.
# The split is based on a random number generator. Supplying a numeric value to
# the random_state argument guarantees we get the same split every time we
# run this script.
train_X, val_X, train_y, val_y = train_test_split(x, y,random_state = 0)
# Define model
# NOTE(review): the tree itself is created without random_state, so the printed
# error may differ slightly between runs - confirm whether it should be seeded.
house_model = DecisionTreeRegressor()
# Fit model on the training split only
house_model.fit(train_X, train_y)

# Get predicted prices on the validation data and report the out-of-sample error
val_predictions = house_model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))



In [None]:
# Helper for comparing tree sizes: scores one max_leaf_nodes setting on validation data.
def get_mae(max_leaf_nodes, predictors_train, predictors_val, targ_train, targ_val):
    """Return the validation mean absolute error of a size-limited decision tree.

    A DecisionTreeRegressor capped at ``max_leaf_nodes`` leaves (random_state=0)
    is fitted on ``predictors_train`` / ``targ_train``; its predictions for
    ``predictors_val`` are then scored against ``targ_val``.
    """
    tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    tree.fit(predictors_train, targ_train)
    return mean_absolute_error(targ_val, tree.predict(predictors_val))

In [None]:
# Compare validation MAE for differing values of max_leaf_nodes.
for max_leaf_nodes in [5, 50, 500, 5000]:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    # %d prints the error truncated to a whole number
    print("Max leaf nodes: %d \t\t  Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Fit a random forest with default settings on the training split and score it on the validation split.
# NOTE(review): no random_state is passed, so the printed error may vary between runs.
forest_model1 = RandomForestRegressor()
forest_model1.fit(train_X, train_y)
house_preds = forest_model1.predict(val_X)
print(mean_absolute_error(val_y, house_preds))

In [None]:
main_file_path1 = '../input/house-prices-advanced-regression-techniques/sample_submission.csv' # path to the sample submission file
data1 = pd.read_csv(main_file_path1)
print(data1.columns)
# NOTE(review): the target (SalePrice) is also the only predictor here, so the tree is
# fitted to reproduce the values already in the sample submission - confirm this is intended.
y1=data1.SalePrice
predictors=['SalePrice'] # rebinds `predictors` from the earlier modelling cell
x1=data1[predictors]
model1=DecisionTreeRegressor() # define model
model1.fit(x1,y1) # fit model
prediction=model1.predict(x1)
print(prediction)
submission3 = pd.DataFrame({'Id': data1.Id, 'SalePrice': prediction})

submission3.to_csv('submission4.csv', index=False) # written as submission4.csv, without the index column

In [None]:
# Count the missing values in each column and show only the columns that have any.
missing_val_count_by_column = (data.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

In [None]:
# Disabled alternative: drop every column that contains missing data.
#data_without_missing_values = data.dropna(axis=1)

In [None]:
# Disabled alternative: list the columns with missing values, then drop them from the data.
#cols_with_missing = [col for col in data.columns
#                                 if data[col].isnull().any()]
#reduced_original_data = data.drop(cols_with_missing, axis=1)


In [None]:
# Fill in missing numeric values and record, per column, which entries were imputed.
from sklearn.impute import SimpleImputer
new_data = data1.copy()  # copy so the original frame is left untouched

# Make new columns indicating what will be imputed. The list is built up front
# because the loop below adds columns to new_data.
cols_with_missing = [col for col in new_data.columns
                     if new_data[col].isnull().any()]
for col in cols_with_missing:
    new_data[col + '_was_missing'] = new_data[col].isnull()

# Imputation. fit_transform returns a bare array, so the labels are restored from the
# frame that was actually imputed (indicator columns included). Reusing data1.columns
# raises a length mismatch as soon as one '_was_missing' column has been added.
my_imputer = SimpleImputer()
new_data = pd.DataFrame(my_imputer.fit_transform(new_data), columns=new_data.columns)

In [None]:
# Fill in missing numeric values and record, per column, which entries were imputed.
# Make a copy to avoid changing the original data when imputing.
new_data = data1.copy()

# Make new columns indicating what will be imputed. The list is built up front
# because the loop below adds columns to new_data.
cols_with_missing = [col for col in new_data.columns
                     if new_data[col].isnull().any()]
for col in cols_with_missing:
    new_data[col + '_was_missing'] = new_data[col].isnull()

# Imputation. fit_transform returns a bare array, so the labels are restored from the
# frame that was actually imputed (indicator columns included). Reusing data1.columns
# raises a length mismatch as soon as one '_was_missing' column has been added.
my_imputer = SimpleImputer()
new_data = pd.DataFrame(my_imputer.fit_transform(new_data), columns=new_data.columns)

In [None]:

# Fit the imputer on the training split only, reuse it on the validation split,
# then score a 20-leaf decision tree on the imputed data.
from sklearn.impute import SimpleImputer

my_imputer = SimpleImputer()
imputed_X_train = my_imputer.fit_transform(train_X)
imputed_X_test = my_imputer.transform(val_X) # transform only: nothing is learned from the validation rows
print("Mean Absolute Error from Imputation:")
print(get_mae(20,imputed_X_train, imputed_X_test, train_y, val_y))

In [None]:
data.dtypes.sample(10) # dtype of 10 randomly sampled columns (not the first 10); object typically means string data

In [None]:
# One-hot encode the object (string) columns so that models can use non-numerical data:
# each category becomes its own indicator column.
one_hot_encoded_training_predictors = pd.get_dummies(data)

In [None]:
# Apply one-hot encoding to both files, then align their columns.
one_hot_encoded_training_predictors = pd.get_dummies(data)
one_hot_encoded_test_predictors = pd.get_dummies(data1)
# join='left' with axis=1 keeps the column set of the left (train.csv) frame; columns that
# the sample_submission.csv frame does not have are added to it and filled with NaN.
final_train, final_test = one_hot_encoded_training_predictors.align(one_hot_encoded_test_predictors,
                                                                    join='left',
                                                                    axis=1)


In [None]:
print(final_train) # the one-hot-encoded training frame after alignment

In [None]:
print(final_test) # NaN marks columns (e.g. the SaleCondition dummies) that do not exist in sample_submission.csv

In [None]:
# XGBoost (gradient-boosted decision trees): prepare numeric-only training and test arrays.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer  # replaces sklearn.preprocessing.Imputer (removed in scikit-learn 0.22)

data4 = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
data4 = data4.dropna(axis=0, subset=['SalePrice'])  # rows without a target cannot be used
y4 = data4.SalePrice
X4 = data4.drop(['SalePrice'], axis=1).select_dtypes(exclude=['object'])  # numeric predictors only
# .values replaces as_matrix(), which was removed in pandas 1.0.
# random_state makes the split repeatable, matching the earlier train_test_split call.
# NOTE: this rebinds train_X / train_y from the decision-tree cells above.
train_X, test_X, train_y, test_y = train_test_split(X4.values, y4.values, test_size=0.25, random_state=0)

# Fit the imputer on the training split only, then reuse it on the held-out split.
my_imputer = SimpleImputer()
train_X = my_imputer.fit_transform(train_X)
test_X = my_imputer.transform(test_X)

In [None]:
# Gradient-boosted decision trees: fit an XGBoost regressor with default settings.
from xgboost import XGBRegressor

# silent=True is passed to avoid printing updates with each cycle.
# NOTE(review): newer XGBoost releases use `verbosity` instead of `silent` - confirm against the installed version.
my_model = XGBRegressor(silent=True)
my_model.fit(train_X, train_y, verbose=False)

In [None]:
# Make predictions on the held-out split with the gradient-boosted model and report the error.
predictions = my_model.predict(test_X)

from sklearn.metrics import mean_absolute_error
# The arguments are passed as (predictions, truth); mean absolute error is symmetric, so the value is unaffected.
print("Mean Absolute Error : " + str(mean_absolute_error(predictions, test_y)))

In [None]:

# n_estimators specifies how many times to go through the modeling cycle.
my_model = XGBRegressor(n_estimators=1000)
# In the underfitting vs overfitting graph, n_estimators moves you further to the right.
# Too low a value causes underfitting, which is inaccurate predictions on both training data and new data.
# Too large a value causes overfitting, which is accurate predictions on training data,
# but inaccurate predictions on new data (which is what we care about). You can experiment with your dataset to find the ideal.
# Typical values range from 100-1000, though this depends a lot on the learning rate.

# The argument early_stopping_rounds offers a way to automatically find the ideal value.
# NOTE(review): recent XGBoost releases expect early_stopping_rounds on the constructor
# rather than on fit() - confirm against the installed version.
my_model.fit(train_X, train_y, early_stopping_rounds=5,
             eval_set=[(test_X, test_y)], verbose=False)
# Early stopping causes the model to stop iterating when the validation score stops improving, even if we aren't at the hard stop for n_estimators.
# It's smart to set a high value for n_estimators and then use early_stopping_rounds to find the optimal time to stop iterating.
# Since random chance sometimes causes a single round where validation scores don't improve,
# you need to specify a number for how many rounds of straight deterioration to allow before stopping.
# early_stopping_rounds = 5 is a reasonable value. Thus we stop after 5 straight rounds of deteriorating validation scores.


In [None]:
# Using learning_rate: the predictions from each added tree are multiplied by learning_rate (0.05).
# n_jobs sets how many parallel threads are used (4 here); it is usually matched to the machine's core count.
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=4)
# NOTE(review): recent XGBoost releases expect early_stopping_rounds on the constructor
# rather than on fit() - confirm against the installed version.
my_model.fit(train_X, train_y, early_stopping_rounds=5, 
             eval_set=[(test_X, test_y)], verbose=False)
# Here's a subtle but important trick for better XGBoost models:
# Instead of getting predictions by simply adding up the predictions from each component model,
# multiply the predictions from each model by a small number before adding them in.
# This means each tree we add to the ensemble helps us less. In practice, this reduces the model's propensity to overfit.
# So, you can use a higher value of n_estimators without overfitting. If you use early stopping, the appropriate number of trees will be set automatically.
# In general, a small learning rate (and large number of estimators) will yield more accurate XGBoost models,
# though it will also take the model longer to train since it does more iterations through the cycle.

In [None]:
# Build a frame from three columns of the aligned sample-submission data and write it out.
# NOTE(review): SaleCondition_Family is one of the columns added by the alignment, so it is
# presumably all NaN here - confirm the extra column is wanted in this file.
submission6 = pd.DataFrame({'Id': final_test.Id,'SaleCondition Family':final_test.SaleCondition_Family, 'SalePrice': final_test.SalePrice})

submission6.to_csv('submission6.csv', index=False)


In [None]:
print(final_test.columns) # column names of the aligned sample-submission frame

In [None]:
# Supplies X and y for the partial dependence plots.
def get_some_data():
    """Load three predictors and the sale price from train.csv.

    Returns:
        A tuple ``(imputed_X5, y5)``: the GarageArea, LotArea and GrLivArea
        columns (in that order) after imputation, as returned by
        ``SimpleImputer.fit_transform``, and the SalePrice column.
    """
    # SimpleImputer replaces sklearn.preprocessing.Imputer, which was removed in
    # scikit-learn 0.22; the rest of the notebook already imports it from sklearn.impute.
    from sklearn.impute import SimpleImputer

    cols_to_use = ['GarageArea', 'LotArea', 'GrLivArea']
    data5 = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
    y5 = data5.SalePrice
    X5 = data5[cols_to_use]
    my_imputer2 = SimpleImputer()
    imputed_X5 = my_imputer2.fit_transform(X5)
    return imputed_X5, y5


In [None]:
# NOTE(review): sklearn.ensemble.partial_dependence is the legacy home of this API; newer
# scikit-learn releases provide partial dependence through sklearn.inspection - confirm
# against the installed version before upgrading.
from sklearn.ensemble.partial_dependence import partial_dependence, plot_partial_dependence
from sklearn.ensemble import GradientBoostingRegressor

# get_some_data is defined in the cell above.
X5, y5 = get_some_data()
# scikit-learn originally implemented partial dependence plots only for Gradient Boosting models.
my_model3 = GradientBoostingRegressor()
# fit the model as usual
my_model3.fit(X5, y5)

# Labels follow the column order used by get_some_data: GarageArea, LotArea, GrLivArea.
# GrLivArea is the above-ground living area (per the competition's data description), not a "green" area.
pdp_feature_names = ['Garage Area', 'Lot Area', 'Above-Ground Living Area']
# One plot per predictor; the loop replaces three copy-pasted calls. As before,
# my_plotsGarArea ends up holding the result of the last call.
for feature_index in range(len(pdp_feature_names)):
    my_plotsGarArea = plot_partial_dependence(my_model3,
                                              features=[feature_index],  # column number of the plot to show
                                              X=X5,                      # raw predictors data
                                              feature_names=pdp_feature_names,  # labels on graphs
                                              grid_resolution=10)        # number of values to plot on x axis


The first graph shows that a larger garage area is associated with a higher predicted house price.
The second graph shows that a larger lot area is associated with a higher predicted house price.
The third graph shows that a larger above-ground living area (GrLivArea) is associated with a higher predicted house price.

In [None]:
# Pipelines: cleaner, more production-ready code. This cell only loads and splits the data.
import pandas as pd
from sklearn.model_selection import train_test_split

# Read Data
data6 = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
cols_to_use = ['GarageArea', 'LotArea', 'GrLivArea','YrSold','PoolArea']
X6 = data6[cols_to_use]
y6 = data6.SalePrice
# NOTE(review): no random_state is passed, so the split differs between runs.
train_X6, test_X6, train_y6, test_y6 = train_test_split(X6, y6)

In [None]:
# A modeling process that uses an imputer to fill in missing values, followed by a
# RandomForestRegressor to make predictions, can be bundled together with make_pipeline.
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer  # replaces sklearn.preprocessing.Imputer (removed in scikit-learn 0.22)

my_pipeline = make_pipeline(SimpleImputer(), RandomForestRegressor())

In [None]:
# Fit and predict using the pipeline as a fused whole: imputation and the model run together.
my_pipeline.fit(train_X6, train_y6)
predictions6 = my_pipeline.predict(test_X6)

In [None]:
# This is the code to do the same thing without pipelines.
from sklearn.impute import SimpleImputer  # replaces sklearn.preprocessing.Imputer (removed in scikit-learn 0.22)

my_imputer6 = SimpleImputer()
my_model7 = RandomForestRegressor()

# Fit the imputer on the training split only, then reuse it on the test split.
imputed_train_X6 = my_imputer6.fit_transform(train_X6)
imputed_test_X6 = my_imputer6.transform(test_X6)
my_model7.fit(imputed_train_X6, train_y6)
predictions6 = my_model7.predict(imputed_test_X6)  # rebinds predictions6 from the pipeline cell

Most scikit-learn objects are either transformers or models.

**Transformers** are for pre-processing before modeling. The Imputer class (for filling in missing values) is an example of a transformer. Over time, you will learn many more transformers, and you will frequently use multiple transformers sequentially.

**Models** are used to make predictions. You will usually preprocess your data (with transformers) before putting it in a model.

You can tell if an object is a transformer or a model by how you apply it. After fitting a transformer, you apply it with the transform command. After fitting a model, you apply it with the predict command. Your pipeline must start with transformer steps and end with a model.

In [None]:
# Read the data and pick the predictors and target used for cross-validation.
import pandas as pd
data7 = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
cols_to_use = ['GarageArea', 'LotArea', 'GrLivArea','YrSold','PoolArea']
X7 = data7[cols_to_use]
y7 = data7.SalePrice

In [None]:
# Specify a pipeline of our modeling steps (it can be very difficult to do cross-validation properly if you aren't using pipelines).
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer  # replaces sklearn.preprocessing.Imputer (removed in scikit-learn 0.22)
my_pipeline2 = make_pipeline(SimpleImputer(), RandomForestRegressor())


In [None]:
# Get the cross-validation scores.
from sklearn.model_selection import cross_val_score
# scoring specifies what measure of model quality to report;
# neg_mean_absolute_error stands for negative mean absolute error.
scores = cross_val_score(my_pipeline2, X7, y7, scoring='neg_mean_absolute_error')
# Scikit-learn has a convention where all metrics are defined so a high number is better.
# Using negatives here allows metrics to be consistent with that convention, though negative MAE is almost unheard of elsewhere.
print(scores)


In [None]:
# Typically a single measure of model quality is wanted to compare between models, so we
# take the average across the cross-validation folds. The scores are negated MAE values,
# hence the multiplication by -1.
# '%.2f' prints two decimal places; the previous '%2f' only set a minimum field width of 2.
print('Mean Absolute Error %.2f' %(-1 * scores.mean()))

There are two main types of data leakage: **Leaky predictors** and **Leaky validation Strategies**.   **Leaky predictors** occur when your predictors include data that will not be available at the time you make predictions. To prevent this type of data leakage, any variable updated (or created) after the target value is realized should be excluded, because that data won't be available to the model when we use it to make new predictions.   
**Leaky Validation Strategies**: A much different type of leak occurs when you aren't careful distinguishing training data from validation data. For example, this happens if you run preprocessing (like fitting the Imputer for missing values) before calling train_test_split. Validation is meant to be a measure of how the model does on data it hasn't considered before. You can corrupt this process in subtle ways if the validation data affects the preprocessing behavior. The end result? Your model will get very good validation scores, giving you great confidence in it, but perform poorly when you deploy it to make decisions. To prevent or find **leaky predictors**: to screen for possible leaky predictors, look for columns that are statistically correlated to your target; also, if you build a model and find it extremely accurate, you likely have a leakage problem. To prevent **Leaky Validation Strategies**: if your validation is based on a simple train-test split, exclude the validation data from any type of fitting, including the fitting of preprocessing steps. This is easier if you use scikit-learn Pipelines. When using cross-validation, it's even more critical that you use pipelines and do your preprocessing inside the pipeline.

In [None]:
# Re-read the training data and show its first 5 rows.
import pandas as pd

# true_values / false_values tell read_csv which strings to treat as True / False.
# NOTE(review): presumably carried over from a dataset with yes/no columns - confirm that
# train.csv actually contains such values.
data10 = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv', 
                   true_values = ['yes'],
                   false_values = ['no'])
print(data10.head())

In [None]:
data10.shape # (number of rows, number of columns) in the dataset

In [None]:
# Calculate cross-validated accuracy of a classifier that predicts the SaleCondition_Partial indicator.
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

one_hot_encoded_training_predictors2 = pd.get_dummies(data10) 

# NOTE(review): .head() keeps only the first 5 rows for both target and predictors, so the
# accuracy below is measured on 5 houses - confirm this tiny sample is intended.
y20 = one_hot_encoded_training_predictors2.SaleCondition_Partial.head()
X20 = one_hot_encoded_training_predictors2.head().drop(['SaleCondition_Partial'], axis=1)

# Since there was no preprocessing, we didn't need a pipeline here. Used anyway as best practice
modeling_pipeline4 = make_pipeline(RandomForestClassifier())
cv_scores = cross_val_score(modeling_pipeline4, X20, y20, scoring='accuracy')
print("Cross-val accuracy: %f" %cv_scores.mean())

In [None]:
print(one_hot_encoded_training_predictors2.columns) # list the columns of the one-hot-encoded dataset

In [None]:
# An accuracy of 1.000000 can be a sign of data leakage, so the suspected leaky columns are
# dropped and the cross-validation is repeated to see whether the score changes.
# NOTE(review): X20 / y20 hold only 5 rows, so a perfect score may simply reflect the tiny
# sample rather than leakage - confirm before drawing conclusions.
potential_leaks = ['SaleType_New', 'SaleCondition_Normal', 'SaleCondition_Abnorml', 'SaleType_ConLw']
X21 = X20.drop(potential_leaks, axis=1)
cv_scores = cross_val_score(modeling_pipeline4, X21, y20, scoring='accuracy')
print("Cross-val accuracy: %f" %cv_scores.mean())


In [None]:
# Compare the SaleType_WD indicator between partial and non-partial sales, to see how
# strongly it is tied to the SaleCondition_Partial target.
# get_dummies may emit 0/1 integers rather than booleans. Casting to bool makes the
# indexing below a row mask: an integer Series would be treated as row labels, and ~
# would be a bitwise NOT on the integers.
is_partial_sale = one_hot_encoded_training_predictors2.SaleCondition_Partial.astype(bool)
# Despite the legacy names, these hold SaleType_WD for partial / non-partial sales.
SaleconditionNormal = one_hot_encoded_training_predictors2.SaleType_WD[is_partial_sale]
SaleconditionNotNormal = one_hot_encoded_training_predictors2.SaleType_WD[~is_partial_sale]

# %.10f prints 10 decimal places. The messages now describe what is measured; the
# previous text about cards and expenditures belonged to a different dataset.
print('Fraction of partial sales whose SaleType is not WD: %.10f' % ((SaleconditionNormal == 0).mean()))
print('Fraction of non-partial sales whose SaleType is not WD: %.10f' % ((SaleconditionNotNormal == 0).mean()))

In [None]:
final_test.to_csv('myDataFrame.csv') # save the aligned sample-submission frame (index column included)


In [None]:
# Keep only the Id and SalePrice columns of the encoded training data and save them (index column included).
lessData=one_hot_encoded_training_predictors.loc[:,['Id','SalePrice']]
lessData.to_csv("data2to2.csv")

In [None]:
import matplotlib.pyplot as plt # pyplot is used to lay out several graphs in one figure
fig, axarr = plt.subplots(2, 2, figsize=(12, 8)) # axarr[row][column] selects the position of each graph
# Top-left: bar chart of the 10 most frequent LotFrontage values (value_counts sorts by frequency).
data['LotFrontage'].value_counts().head(10).plot.bar(title='LotFrontage bar chart', ax=axarr[0][0])
# Top-right: the same counts again, without a title (these are raw counts, not proportions).
data['LotFrontage'].value_counts().head(10).plot.bar(ax=axarr[0][1])
# Bottom-left: counts of each SalePrice among the first 25 rows, ordered by price.
data['SalePrice'].head(25).value_counts().sort_index().plot.bar(ax=axarr[1][0])
# Bottom-right: line chart of the SalePrice counts among the first 5 rows, ordered by price.
data['SalePrice'].head(5).value_counts().sort_index().plot.line(ax=axarr[1][1])

In [None]:
data['SalePrice'].head(100).value_counts().sort_index().plot.line() # line chart of how often each SalePrice occurs in the first 100 rows, ordered by price

In [None]:
data['SalePrice'].head(100).value_counts().sort_index().plot.area() # area chart (similar to a line chart) of the same counts for the first 100 rows

Examples of **interval** variables are the wind speed in a hurricane, shear strength in concrete, and the temperature of the sun. An **interval** variable goes beyond an ordinal categorical variable: it has a meaningful order, and in addition the difference between any two entries can be quantified, and that difference is itself an **interval** variable.

In [None]:
data[data['SalePrice'] < 300000]['SalePrice'].plot.hist() # histogram of the SalePrices below 300000

In [None]:
data['SalePrice'].plot.hist() # histogram of all the SalePrice data

In [None]:
data[data['SalePrice'] < 60000] # show all the records with a SalePrice below 60000

In [None]:
# Scatter plot of 100 randomly sampled houses (not the first 100) with a SalePrice below 200000.
data[data['SalePrice'] < 200000].sample(100).plot.scatter(x='SalePrice', y='LotArea')
print("\n") # prints blank lines
# Scatter plot of every house with a SalePrice below 200000.
data[data['SalePrice'] < 200000].plot.scatter(x='SalePrice', y='LotArea')
print("\n") # prints blank lines
# Hexbin plot of the same houses, using 15 hexagons across the grid.
data[data['SalePrice'] < 200000].plot.hexbin(x='SalePrice', y='LotArea', gridsize=15)

In [None]:
# Plots of the first 20 rows of the sample-submission frame, one series per column.
data1.head(20).plot.bar(stacked=True) # stacked bar chart
print("\n") # prints blank lines
data1.head(20).plot.area() # bivariate area graph
print("\n") # prints blank lines
data1.head(20).plot.line() # bivariate line graph

In [None]:
import seaborn as sns # for seaborn plotting
# Count plot of the first 5 SalePrice values: one bar per distinct price, showing how many
# times it occurs (a count, not a probability). The Series is passed as x= because the
# meaning of the first positional argument differs between seaborn versions.
sns.countplot(x=data['SalePrice'].head(5))


In [None]:
import seaborn as sns # for seaborn plotting
sns.kdeplot(data.query('SalePrice < 200000').SalePrice, color='mediumvioletred') # smoothed density estimate of the SalePrices below 200000

In [None]:
import seaborn as sns # for seaborn plotting (not used by the pandas plot below)
data[data['SalePrice'] < 200000]['SalePrice'].value_counts().sort_index().plot.line() # line chart of how many times each SalePrice below 200000 occurs

In [None]:
import seaborn as sns # for seaborn plotting
# KDE plot of SalePrice and LotArea for 50 randomly sampled houses priced below 100000.
# NOTE(review): whether a two-column frame is drawn as one bivariate density or as two
# separate curves appears to depend on the seaborn version - confirm the rendered result.
sns.kdeplot(data[data['SalePrice'] < 100000].loc[:, ['SalePrice', 'LotArea']].dropna().sample(50))

In [None]:
import seaborn as sns # for seaborn plotting
# Histogram of LotArea with 10 bins and no density curve (kde=False).
# NOTE(review): distplot is deprecated in newer seaborn releases in favour of histplot / displot - confirm.
sns.distplot(data['LotArea'], bins=10, kde=False)

In [None]:
import seaborn as sns # for seaborn plotting
# Joint plot of SalePrice (x) against LotArea (y) for houses priced below 100000:
# a scatter graph with a histogram of each variable along the margins.
sns.jointplot(x='SalePrice', y='LotArea', data=data[data['SalePrice'] < 100000])

In [None]:
import seaborn as sns # for seaborn plotting
# Joint plot of SalePrice (x) against LotArea (y) for houses priced below 100000,
# drawn as a hex graph (kind='hex') with marginal histograms.
sns.jointplot(x='SalePrice', y='LotArea', data=data[data['SalePrice'] < 100000], kind='hex', 
              gridsize=20)

In [None]:
import seaborn as sns # for seaborn plotting
# Keep only the houses whose LotShape is one of the 3 most common values.
Bx = data[data.LotShape.isin(data.LotShape.value_counts().head(3).index)]

# Box plot of SalePrice for each of those lot shapes.
sns.boxplot(
    x='LotShape',
    y='SalePrice',
    data=Bx
)

In [None]:
import seaborn as sns # for seaborn plotting
# Violin plot of SalePrice for the 3 most common LotShape values.
sns.violinplot(
    x='LotShape',
    y='SalePrice',
    data=data[data.LotShape.isin(data.LotShape.value_counts()[:3].index)]
)

In [None]:
import seaborn as sns # for seaborn plotting (advanced graph plotting)
dta = data # another name for the same frame (not a copy)

# One panel per LotShape value, wrapping after 6 columns; each panel shows the
# density of SalePrice for that lot shape.
g = sns.FacetGrid(dta, col="LotShape", col_wrap=6) 
g.map(sns.kdeplot, "SalePrice")

In [None]:
import seaborn as sns # for seaborn plotting (advanced graph plotting)
# Restrict to two Street types and three HouseStyle values.
dta2 = data[data['Street'].isin(['Pave', 'Grvl'])]
dta2 = dta2[dta2['HouseStyle'].isin(['2Story', '1Story', '1.5Fin'])]

# Grid with one row per Street and one column per HouseStyle; each panel shows the
# distribution of SalePrice as a violin plot.
g2 = sns.FacetGrid(dta2, row="Street", col="HouseStyle")
g2.map(sns.violinplot, "SalePrice")

In [None]:
import seaborn as sns # for seaborn plotting (advanced graph plotting)
sns.pairplot(data[['SalePrice', 'PoolArea', 'LotArea']]) # pair plot: every pairing of the three columns

In [None]:
import seaborn as sns

# Multivariate scatter plot: SalePrice against LotArea, coloured by HouseStyle, for three
# house styles. fit_reg=False leaves out the regression line.
sns.lmplot(x='SalePrice', y='LotArea', hue='HouseStyle', 
           data=data.loc[data['HouseStyle'].isin(['1.5Fin', '1Story', '2Story'])], 
           fit_reg=False)

In [None]:
import seaborn as sns

# Multivariate scatter plot as above, with a different marker for each of the three house styles.
sns.lmplot(x='SalePrice', y='LotArea',  markers=['o', 'x', '*'], hue='HouseStyle', 
           data=data.loc[data['HouseStyle'].isin(['1.5Fin', '1Story', '2Story'])], 
           fit_reg=False)

In [None]:
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go # for plotly graphs

# Basic plotly scatter graph of SalePrice against LotArea for the first 1000 rows.
iplot([go.Scatter(x=data.head(1000)['SalePrice'], y=data.head(1000)['LotArea'], mode='markers')])

In [None]:
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go # for plotly graphs

# KDE-style plot (what plotly refers to as a Histogram2dContour) overlaid with a scatter plot.
# Note that the contour uses the first 500 rows while the scatter uses the first 1000.
# NOTE(review): go.Contours may be a legacy name in newer plotly releases - confirm.
iplot([go.Histogram2dContour(x=data.head(500)['SalePrice'], 
                             y=data.head(500)['LotArea'], 
                             contours=go.Contours(coloring='heatmap')),
       go.Scatter(x=data.head(1000)['SalePrice'], y=data.head(1000)['LotArea'], mode='markers')])

In [None]:
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go # for plotly graphs
# Count the houses for every (SalePrice, LotArea) combination.
dtt = data.assign(n=0).groupby(['SalePrice', 'LotArea'])['n'].count().reset_index()
dtt = dtt[dtt["LotArea"] < 2000] # keep only lots smaller than 2000
# Pivot into a LotArea x SalePrice grid of counts, with 0 where a combination does not occur.
ver = dtt.pivot(index='LotArea', columns='SalePrice', values='n').fillna(0).values.tolist()
iplot([go.Surface(z=ver)]) # plotly Surface plot of the grid

In [None]:
from plotnine import * # plotnine graphs
dta3 = data.head(1000)

# Scatter graph of SalePrice against LotArea for the first 1000 rows, with a smoothed trend
# line from stat_smooth (a fitted smoother, not a logistic regression).
(
    ggplot(dta3)
        + aes('SalePrice', 'LotArea')
        + geom_point()
        + stat_smooth()
)

In [None]:
from plotnine import * # plotnine graphs
dta4 = data.head(1000)

# Same scatter graph with a smoothed trend line, with the points coloured by SalePrice.
(
    ggplot(dta4)
        + aes('SalePrice', 'LotArea')
        + geom_point()
        + aes(color='SalePrice')
        + stat_smooth()
)

In [None]:
from plotnine import * # plotnine graphs
dta5 = data.head(1000)

# Faceting: the coloured scatter graph with a trend line is repeated in one panel per
# value of the categorical variable HouseStyle.
(ggplot(dta5)
     + aes('SalePrice', 'LotArea')
     + aes(color='SalePrice')
     + geom_point()
     + stat_smooth()
     + facet_wrap('HouseStyle')
)

In [None]:
from plotnine import * # plotnine graphs
# geom_bin2d is the plotnine equivalent of a hexplot: a two-dimensional histogram,
# here of SalePrice against HouseStyle with 20 bins.
(ggplot(data)
         + aes('SalePrice', 'HouseStyle')
         + geom_bin2d(bins=20)
         + ggtitle("Most Common house styles")
)

In [None]:
from pandas.plotting import autocorrelation_plot

autocorrelation_plot(data['SalePrice'])

# The autocorrelation plot is a multivariate summarization-type plot that lets you check every periodicity at the same time.
# It does this by computing a summary statistic - the correlation score - across every possible lag in the dataset. This is known as autocorrelation.
# In an autocorrelation plot the lag is on the x-axis and the autocorrelation score is on the y-axis.
# The farther away the autocorrelation is from 0, the greater the influence that records that far away from each other exert on one another.
# The plot above is what an autocorrelation plot looks like when applied to the SalePrice data.

In [None]:
test_path = '../input/house-prices-advanced-regression-techniques/test.csv' # path to the test data
Testdata = pd.read_csv(test_path)

# Factors that will be used to predict the sale price
desired_factors = ['1stFlrSF','YearBuilt', 'LotArea', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']

# Take the same predictor columns from the training and test data, and set the target to SalePrice
train_data = data[desired_factors]
test_data = Testdata[desired_factors]
target = data.SalePrice

# Refit the decision-tree `model` from the earlier cell on the training predictors and target
model.fit(train_data, target)

# Predict sale prices for the test.csv houses (displayed as the cell's output)
model.predict(test_data)

In [None]:
# Pair each test Id with its predicted SalePrice and write the submission file without the index column.
submit= pd.DataFrame({'Id': Testdata.Id, 'SalePrice': model.predict(test_data)})

submit.to_csv('submitFile.csv', index=False)

In [None]:
# Text-classification pipeline applied to the HouseStyle column: TF-IDF features feed a
# LogisticRegression that predicts SalePrice.
# NOTE(review): LogisticRegression is a classifier, so every distinct SalePrice is presumably
# treated as its own class - confirm that this is intended for a price target.
x_train = data['HouseStyle']
x_test = Testdata['HouseStyle']
y=data['SalePrice'] # rebinds `y` from the earlier modelling cells
from nltk.tokenize import TweetTokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
text_clf = Pipeline([('tfidf', TfidfVectorizer(ngram_range=(1,10), max_features=10000, lowercase=True, use_idf=True, smooth_idf=True, sublinear_tf=False, tokenizer=TweetTokenizer().tokenize, stop_words='english')),
                         ('clf', LogisticRegression(random_state=17, C=1.8))])
from sklearn.model_selection import RandomizedSearchCV
# Candidate values of the classifier's C parameter for a randomized search.
parameters = {
               'clf__C': np.logspace(.1,1,10),
 }
# NOTE(review): the search object is created but never fitted in this cell; the pipeline
# below is trained with its fixed C=1.8 - confirm whether gs.fit(...) was intended.
gs = RandomizedSearchCV(text_clf, parameters, n_jobs=-1, verbose=3)
text_clf.fit(x_train, y)
predicted = text_clf.predict(x_test)
Testdata['SalePrice'] = predicted # adds the predictions to Testdata as a new column

In [None]:
# Write the Id and SalePrice columns of the sample-submission frame to a file, without the index column.
onesubmission = data1[["Id","SalePrice"]]
onesubmission.to_csv("Onesubmission.csv", index = False)

**probabilistic neural network**

In [None]:
import math

def gaussian_pdf1(x, sigma, w):
    """Evaluate an unnormalised one-dimensional Gaussian centred on ``w`` at ``x``.

    Returns ``exp(-(x - w)**2 / (2 * sigma**2))``: exactly 1.0 when ``x == w``
    and decaying towards 0 as ``x`` moves away, with ``sigma`` setting the width.
    """
    squared_distance = (x - w) ** 2
    two_variance = 2 * sigma ** 2
    return math.exp(-squared_distance / two_variance)

In [None]:
%matplotlib inline
import matplotlib
import numpy as np

import matplotlib.pyplot as plt

# Plot the one-dimensional Gaussian for a single width sigma, centred on w.
sigma = 0.1
w = 0.5
x = np.linspace(w - 2, w + 2, 100)  # NOTE: rebinds `x`, which earlier held the predictor frame
fig = plt.figure('Fungsi Gaussian')
ax = fig.add_subplot(111)
# Raw strings keep the LaTeX backslashes literal: '\s' in a normal string literal is an
# invalid escape sequence, which Python warns about. The resulting text is unchanged.
ax.set_title(r'Fungsi Gaussian dengan $\sigma = %s, w = %s$' % (sigma, w))
ax.set_xlabel('$x$')
ax.set_ylabel(r'$f(x; \sigma, w)$')
ax.grid(which='major')
ax.plot(x, [gaussian_pdf1(x_i, sigma, w) for x_i in x])
plt.show()

In [None]:
%matplotlib inline
import matplotlib
import numpy as np

import matplotlib.pyplot as plt

# Plot the one-dimensional Gaussian for several widths on the same axes.
w = 0.5
x = np.linspace(w - 2, w + 2, 100)
fig = plt.figure('Fungsi Gaussian')
ax = fig.add_subplot(111)
# Raw strings keep the LaTeX backslashes literal: '\s' and '\{' in a normal string literal
# are invalid escape sequences, which Python warns about. The resulting text is unchanged.
ax.set_title(r'Fungsi Gaussian dengan $\sigma = \{0.1, 0.2, 0.5, 1.0\}; w = %s$' % (w))
ax.set_xlabel('$x$')
ax.set_ylabel(r'$f(x; \sigma, w)$')
ax.grid(which='major')
# One curve per width; the loop replaces four copy-pasted plot calls and yields the same labels.
for curve_sigma in (0.1, 0.2, 0.5, 1.0):
    ax.plot(x, [gaussian_pdf1(x_i, curve_sigma, w) for x_i in x],
            label=r'$\sigma = %s$' % curve_sigma)
plt.legend()
plt.show()

In [None]:
import math

def gaussian_pdf2(x, sigma, w_j):
    """Evaluate an unnormalised isotropic Gaussian kernel centred on ``w_j``.

    Args:
        x: Point to evaluate, given as a sequence of coordinates.
        sigma: Kernel width; must be non-zero.
        w_j: Kernel centre, a sequence of coordinates paired with ``x``.

    Returns:
        ``exp(-d / (2 * sigma**2))`` where ``d`` is the sum of squared
        coordinate differences: 1.0 at the centre, decaying towards 0.

    The squared distance is summed over every coordinate pair, so the kernel is
    no longer limited to two dimensions; for two-dimensional inputs the value is
    the same as before.
    """
    squared_distance = sum((x_i - w_i) ** 2 for x_i, w_i in zip(x, w_j))
    return math.exp(-squared_distance / (2 * sigma ** 2))

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Plot the two-variable Gaussian kernel as a 3-D surface.
sigma = 0.1
w_j = (0.5, 0.7)


# Evaluate the kernel on a 100 x 100 grid spanning three sigma either side of the centre.
x_0_range = np.linspace(w_j[0] - 3*sigma, w_j[0] + 3*sigma, 100)
x_1_range = np.linspace(w_j[1] - 3*sigma, w_j[1] + 3*sigma, 100)
X_0, X_1 = np.meshgrid(x_0_range, x_1_range)
fs = np.array( [gaussian_pdf2((x_0, x_1), sigma, w_j)
                for x_0, x_1 in zip(np.ravel(X_0), np.ravel(X_1))] )
FS = fs.reshape(X_0.shape)  # back to the grid's shape for plot_surface

fig = plt.figure('Fungsi Gaussian dengan 2 variabel')
ax = fig.add_subplot(111, projection='3d')
# Raw strings keep the LaTeX backslashes literal: '\s' in a normal string literal is an
# invalid escape sequence, which Python warns about. The resulting text is unchanged.
ax.set_title(r'$\sigma = %s, w_j = (%s, %s)$' % (sigma, w_j[0], w_j[1]))
ax.set_xlabel('$x_0$')
ax.set_ylabel('$x_1$')
ax.set_zlabel(r'$f(x_0, x_1; \sigma, w_j)$')
ax.plot_surface(X_0, X_1, FS)
plt.show()

In [None]:
import numpy as np

# Kernel centres: one (SalePrice, LotArea) row per house. The two columns are taken
# straight from the frame instead of looking up every row in a Python loop.
W = data[['SalePrice', 'LotArea']].values
W

In [None]:
# Evaluate the two-variable Gaussian kernel at the point x against every centre in W.
# NOTE(review): x = (0.2, 0.6) and sigma = 0.1 look like values for normalised inputs, while
# W holds raw SalePrice / LotArea values - presumably every entry evaluates to (nearly) 0;
# confirm whether W should be scaled first.
sigma = 0.1
x = (0.2, 0.6)
patterns = np.array([ gaussian_pdf2(x, sigma, w_j) for w_j in W ])
patterns

In [None]:
# Sum the kernel responses of three hand-picked groups of centres (rows 0-1, 2-3 and 4-6).
# NOTE(review): the grouping is hard-coded by position - presumably each group stands for
# one class of the probabilistic neural network; confirm how rows map to classes.
n1 = patterns[0] + patterns[1]
n2 = patterns[2] + patterns[3]
n3 = patterns[4] + patterns[5] + patterns[6]

print('n1 = %s' % n1)
print('n2 = %s' % n2)
print('n3 = %s' % n3)


**If you have any questions or hit any problems, come to the [Learn Discussion](https://www.kaggle.com/learn-forum) for help.**

**Return to [ML Course Index](https://www.kaggle.com/learn/machine-learning)**