In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from sklearn.metrics import r2_score

# Read the avocado CSV into a DataFrame; the location is named once so it is
# easy to spot and change.
AVOCADO_CSV = r"C:\Users\Administrator\Desktop\datasets\worldstat\avocado.csv"
df = pd.read_csv(AVOCADO_CSV)

# First look at the data: list the column names
df.columns

Index(['Unnamed: 0', 'Date', 'AveragePrice', 'Total Volume', '4046', '4225',
       '4770', 'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags', 'type',
       'year', 'region'],
      dtype='object')


In [2]:
# Print the first rows with pandas' row/column display limits lifted for the
# duration of this block only (the options are restored on exit).
unlimited_display = ('display.max_rows', None, 'display.max_columns', None)
with pd.option_context(*unlimited_display):
    preview = df.head()
    print(preview)

   Unnamed: 0        Date  AveragePrice  Total Volume     4046       4225  \
0           0  2015-12-27          1.33      64236.62  1036.74   54454.85   
1           1  2015-12-20          1.35      54876.98   674.28   44638.81   
2           2  2015-12-13          0.93     118220.22   794.70  109149.67   
3           3  2015-12-06          1.08      78992.15  1132.00   71976.41   
4           4  2015-11-29          1.28      51039.60   941.48   43838.39   

     4770  Total Bags  Small Bags  Large Bags  XLarge Bags          type  \
0   48.16     8696.87     8603.62       93.25          0.0  conventional   
1   58.33     9505.56     9408.07       97.49          0.0  conventional   
2  130.50     8145.35     8042.21      103.14          0.0  conventional   
3   72.58     5811.16     5677.40      133.76          0.0  conventional   
4   75.78     6183.95     5986.26      197.69          0.0  conventional   

   year  region  
0  2015  Albany  
1  2015  Albany  
2  2015  Albany  
3  2015 

In [None]:
# Print pandas' summary of the frame (column dtypes, non-null counts, memory
# usage) to check types and spot missing values.
df.info()

In [None]:
# Per-column flag: True when the column holds at least one missing value
df.isna().any()

In [None]:
# Drop the two leading columns, which are not used as features: 'Unnamed: 0'
# (appears to be a row index saved into the CSV) and 'Date'.
# They are dropped by name with errors='ignore' so that re-running this cell is
# a no-op instead of silently removing the next two columns, and `columns=` is
# used because pandas 2.0 no longer accepts the axis as a positional argument.
df = df.drop(columns=['Unnamed: 0', 'Date'], errors='ignore')
df.head()

In [None]:
# Pairwise correlation matrix of the numeric columns, used to spot redundant
# (highly correlated) features. Non-numeric columns are excluded explicitly:
# pandas 2.0 no longer drops them silently and raises on the string columns.
corr = df.select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(corr, annot=True, ax=ax)
ax.set_title('Correlation between numeric columns')
plt.show()

In [None]:
# Rename the 'type' column to 'label' so it no longer shares its name with the
# Python builtin `type`. An exact rename is used instead of a substring replace
# so that only this one column is affected; it is a no-op when re-run.
df = df.rename(columns={'type': 'label'})

In [None]:
# One-hot encode the categorical columns, dropping the first level of each to
# avoid a redundant indicator. For `label` this leaves the single 'organic'
# column that is used as the target further down.
label = pd.get_dummies(df['label'], drop_first=True)
# Year levels are integers; prefix them so every column name stays a string
# (scikit-learn 1.2+ rejects frames whose column names mix str and int).
year = pd.get_dummies(df['year'], prefix='year', drop_first=True)
region = pd.get_dummies(df['region'], drop_first=True)

# Replace the raw categorical columns with their indicator columns.
# `axis=` / `columns=` are passed by keyword: pandas 2.0 removed the positional form.
df = pd.concat(
    [df.drop(columns=['label', 'year', 'region']), label, year, region],
    axis=1,
)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC, SVR
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, ElasticNetCV

In [None]:
# Seed for the train/test split so the reported metrics are reproducible
RANDOM_STATE = 42

# Features are every column except the 'organic' indicator, which is the target.
X = df.drop(columns='organic')
# scikit-learn 1.2+ requires all column names to be strings
X.columns = X.columns.astype(str)
# Cast the indicator to int: depending on the pandas version it is bool or
# uint8, and both misbehave in the subtraction done by the error metrics.
y = df['organic'].astype(int)

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=RANDOM_STATE
)

# Scale each feature to [0, 1]; the scaler is fitted on the training split only
# and then applied to the test split, so no test information leaks into it.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Candidate models to benchmark, as [display label, estimator] pairs.
# The list keeps the name `classifiers` because later cells iterate over it,
# but every entry except SVC is a regressor fitted on the 0/1 'organic' target.
classifiers = [['DecTree :', DecisionTreeRegressor()],
               ['RandFor :', RandomForestRegressor()],
               ['KNeighb :', KNeighborsRegressor(n_neighbors=5)],
               ['SVRegre :', SVR()],
               # NOTE: SVC is the only true classifier here; it predicts class
               # labels and its .score() is accuracy rather than R^2.
               ['SVClass :', SVC()],
               # Label corrected: this is GradientBoostingRegressor, not a classifier
               ['GBRegre :', GradientBoostingRegressor()],
               ['ExTRegr :', ExtraTreesRegressor(n_estimators=1, min_samples_split=5)],
               ['ElNetCV :', ElasticNetCV()]]

In [None]:
# Fit each candidate on the training split and report its RMSE on the test split
print("\n====== RMSE ======")
for name, classifier in classifiers:
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    print(name, rmse)

In [None]:
# R^2 of every already-fitted model on the test split.
# r2_score is applied to the predictions instead of calling .score(), because
# .score() is R^2 only for regressors: for SVC it returns mean accuracy, which
# would be printed under the R^2 heading as a different metric.
print("\n====== R^2 ======")
# Float casts keep the metric arithmetic well-defined for bool/uint8 targets
y_true = np.asarray(y_test, dtype=float)
for name, classifier in classifiers:
    y_hat = np.asarray(classifier.predict(X_test), dtype=float)
    print(name, r2_score(y_true, y_hat))

In [None]:
# Train the model chosen for the final evaluation: a random forest regressor
# restricted to a single tree. fit() returns the estimator itself, so the
# construction and training can be chained.
classifier = RandomForestRegressor(n_estimators=1).fit(X_train, y_train)
classifier

In [None]:
# Predict on the held-out test features with the model fitted in the previous
# cell. That model is a regressor, so y_pred holds real-valued outputs rather
# than class labels.
y_pred = classifier.predict(X_test)

In [3]:
# Text confusion matrix (pandas crosstab instead of a seaborn heatmap).
# y_pred comes from a regressor, so it is real-valued; threshold it at 0.5 to
# obtain 0/1 class labels, otherwise every distinct predicted value would get
# its own column. Both sides are converted to plain int arrays so the row and
# column headers use the same 0/1 coding regardless of the target's dtype.
print("\n====== Confusion Matrix ======")
y_true_label = np.asarray(y_test).astype(int)
y_pred_label = (np.asarray(y_pred, dtype=float) >= 0.5).astype(int)
print(pd.crosstab(y_true_label, y_pred_label,
                  rownames=['True'], colnames=['Predicted'], margins=True))




NameError: name 'y_test' is not defined