# Reference code for Data Mining Exam

Warning: none of this code is runnable in its current form.  It only exists to jog your memory or save you time.

### Import statements

In [1]:
from itertools import combinations, chain
from statistics import mean, median

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score

# IPython magic: render matplotlib figures inline, directly below the cell that draws them.
%matplotlib inline

# Apply matplotlib's "fivethirtyeight" style sheet to every figure created after this line.
plt.style.use("fivethirtyeight")

### Dataframe i/o

In [45]:
# Read the CSV into a dataframe, using the file's first column as the row index.
# Change `index_col` (or pass `sep=...`) if your file is laid out differently.
df = pd.read_csv(
    filepath_or_buffer="data/test.csv",
    index_col=0,
)

# To rename the columns, assign a list with one name per column:
#df.columns = ['column1', 'column2']

# Preview the first five rows; the last expression of a cell is what gets displayed.
df.head(n=5)

Unnamed: 0,default,student,balance,income,rent
1,No,No,729.526495,44361.625074,1382.367143
2,No,Yes,817.180407,12106.1347,677.912047
3,No,No,1073.549164,31767.138947,1589.497127
4,No,No,529.250605,35704.493935,1245.529318
5,No,No,785.655883,38463.495879,1195.162718


### Tweaking a dataframe

In [46]:
# ---------------------------------------------------------------------------
# Reference snippets (commented out; copy and adapt as needed)
# ---------------------------------------------------------------------------

# Set an individual entry of the dataframe, where the pair `[index, 'col_name']` is the row/column label, respectively
#df.loc[index, 'col_name'] = <new_value>

# Finding rows that contain at least one NaN value
#df[df.isnull().any(axis=1)]

# Which rows have `column1` values in a certain list of possibilities?
#df[df['column1'].isin(["Turkey", "Bulgaria", "Italy"])]

# Basic slicing
#df[['column1', 'column2']]     # by column
#df[15:42]                      # by row

# Adding a new column
#df['new_col'] = np.random.randn(df.shape[0])

# How many rows carry the -1 placeholder in `income`?
#(df['income'] == -1).sum()

# ---------------------------------------------------------------------------
# Cleaning steps actually applied to `df`
# ---------------------------------------------------------------------------

# Keep only the rows whose `rent` is non-negative.  The explicit `.copy()` gives us an
# independent dataframe, so the column assignments below do not write into a slice of
# the original frame (which would raise pandas' SettingWithCopyWarning).
df = df[df['rent'] >= 0].copy()

# `income == -1` is treated as a placeholder and replaced by the median income.
# The median is computed once, and only over the rows that are NOT placeholders, so
# the -1 entries cannot drag the imputed value down.
income_median = df.loc[df["income"] != -1, "income"].median()


def replacer(x):
    """Return `x` unchanged unless it equals the -1 placeholder, in which case return the median income."""
    return x if not x == -1 else income_median


# Label -> 0/1 encoding shared by the `student` and `default` columns.
r_dict = {" student": 1, "no_student": 0, "Yes": 1, "No": 0}

# NOTE: `Series.map` with a dict turns every value that is not a key into NaN, so
# running this cell a second time (when the columns already hold 0/1) would wipe them.
# Reload `df` before re-running.
df["income"] = df["income"].map(replacer)
df["student"] = df["student"].map(r_dict)
df["default"] = df["default"].map(r_dict)

In [47]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# Only the last expression of a cell is displayed, so a preceding bare `df.head()` would
# never be shown; put it in its own cell if you also want a preview of the rows.
df.describe()

Unnamed: 0,default,student,balance,income,rent
count,9994.0,9994.0,9994.0,9994.0,9994.0
mean,0.03332,0.293976,835.292565,33537.40844,1377.000441
std,0.17948,0.455604,483.769635,13292.705972,432.981012
min,0.0,0.0,0.0,2541.200814,8.111088
25%,0.0,0.0,481.637704,21381.793659,1070.683359
50%,0.0,0.0,823.540575,34511.419087,1378.638367
75%,0.0,1.0,1166.059303,43772.048741,1681.595983
max,1.0,1.0,2654.322576,73554.233495,2921.464501


In [59]:
# Predictors: the four feature columns, selected by label.
X = df.loc[:, ["student", "balance", "income", "rent"]]
# Target: the `default` column.
y = df.loc[:, "default"]

# Fit a logistic regression with scikit-learn's default settings.
# `fit` returns the estimator, so the cell output is the fitted model's repr.
model = LogisticRegression()
model.fit(X=X, y=y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [66]:
# Predict on the same rows the model was fitted on (in-sample predictions).
preds = model.predict(X)

# Confusion matrix: rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y, y_pred=preds)

array([[9623,   38],
       [ 327,    6]])

### Misc comprehension magic

In [None]:
# What attributes does this object have?
# Replace the `<object_instance>` placeholder with the object you want to inspect;
# the line is not valid Python until you do.
object_ = <object_instance>
# Print every name reported by `dir()` that does not start with an underscore, one per line.
print(*[item for item in dir(object_) if item[0] != "_"], sep='\n')

# What are all your parameter options for various models to fit?
parameters = ['x_1', 'x_2', 'x_3', 'x_4']

# Every non-empty subset of `parameters`, as tuples, ordered by subset size
# (all 1-element tuples first, then the 2-element ones, and so on).
# `chain.from_iterable` flattens the per-size `combinations` iterators into one stream;
# starting at size 1 skips the empty subset instead of generating and filtering it.
param_options = list(chain.from_iterable(
    combinations(parameters, size) for size in range(1, len(parameters) + 1)
))

# Plotting

### Basic:

In [None]:
# Subplots: several axes inside one figure.
# Pass `figsize=(width, height)` to `plt.figure` to change the figure size.
figure = plt.figure()
# Arguments are (rows, columns, position), e.g. (2,2,1) through (2,2,4) for a 2x2 grid.
ax = figure.add_subplot(number_of_rows, number_of_columns, which_subplot_is_this)

# Scatter plot: each point is coloured by mapping its `color_column` entry to a colour code.
plt.scatter(
    x=df['x_axis_col'],
    y=df['y_axis_col'],
    c=df['color_column'].map({'entry_1':'r', 'entry_2':'b', 'entry_3':'g'}),
)

# Line plot: the first list holds the x-coordinates, the second the matching
# y-coordinates; consecutive points are joined by straight segments, so use
# many more points for a smooth curve.
plt.plot(
    [x1, x2, x3],
    [y1, y2, y3],
    color='k',    # 'k' is black
    lw=2,
)

# Histogram of the `balance` column, split into 8 bins.
plt.hist(x=df['balance'], bins=8)

# Whichever of the above you use, finish the cell with:
plt.show()

### Contour

In [None]:
#### SET THESE ####
# This should be an (n x p) numpy ndarray, consisting of your data.  This
# will NOT create your scatter plot, you'll need to do that yourself.
X_data = df[['x_1', 'x_2']].values

# Names of the dataframe columns shown on the horizontal and vertical axes.
x_axis_name = 'x_1'
y_axis_name = 'x_2'

# CONSTANTS
# Number of grid points along each axis; the model is evaluated on GRANULARITY**2 points.
GRANULARITY = 50

# Create a mesh spanning the observed range of both plotted columns
x1_min, x1_max, x2_min, x2_max = (df[x_axis_name].min(),
                                  df[x_axis_name].max(),
                                  df[y_axis_name].min(),
                                  df[y_axis_name].max())

xx1, xx2 = np.meshgrid(np.linspace(x1_min, x1_max, GRANULARITY),
                       np.linspace(x2_min, x2_max, GRANULARITY))

# In order to produce the prediction values, we `ravel` xx1 and xx2.
# This turns each grid of coordinates into a flat 1-D array; the predictions are
# `reshape`d back into the grid's shape further down.
xx1_col = xx1.ravel()
xx2_col = xx2.ravel()

########################################## SADLY, YOU NEED TO DO THIS ##########################################
# This matrix needs to consist of exactly what your X matrix consisted of, where "xx1_col" replaces `x_1`,
# and "xx2_col" replaces "x_2".  IF YOU HAVE OTHER COLUMNS, YOU NEED TO ESTIMATE THESE EITHER WITH MEDIANS OR 
# BY SOME FUNCTION OF YOUR x- OR y-AXIS COLUMNS.  Here's an example of filling with a median:
# xx3_col = np.zeros(xx1_col.shape) + df['x_3'].median()
X_columns = np.c_[xx1_col,xx2_col]

#### PREDICTING ####
################################### CHANGE THE NAME OF YOUR MODEL HERE IF NEEDED ###############################
# `predict_proba` returns one column per class, ordered like `model.classes_`.  For a 0/1
# target, column 0 is P(y = 0) and column 1 is P(y = 1).  We take column 1 so the plotted
# surface matches the "$P(y = 1)$" colorbar label below.
yy = model.predict_proba(X_columns)[:, 1]
yy = yy.reshape(xx1.shape)

#### PLOTTING ####

figure = plt.figure() # You can set `figsize` here if you want
ax = figure.add_subplot(1,1,1)
# Set plot limits
ax.set_xlim(x1_min, x1_max)
ax.set_ylim(x2_min, x2_max)
ax.set_xlabel('Always label your axes!')
ax.set_ylabel('Always label your axes!')

## Uncomment whichever of these you want (delete the surrounding triple quotes): ##
# Just the decision boundary, where:
#     - `levels` is what determines the actual value (or values) of the boundary, and
#     - `colors` is the color of each level above
"""
ax.contour(xx1, xx2, yy, levels=[.5], colors=['c'])
"""

# The entire decision surface, where:
#     - `100`  refers to the number of contours to plot, set it to more for a smoother look
#     - `cmap` is the colormap we're using.  Options: http://matplotlib.org/users/colormaps.html
#     - `vmin` and `vmax` are the low and high output values to plot
"""
contour = ax.contourf(xx1, xx2, yy, 100, cmap="RdBu",
                      vmin=0, vmax=1)
ax_c = figure.colorbar(contour) # Adds a colorbar, which is like a continuous legend
ax_c.set_label("$P(y = 1)$") # Labels the above colorbar
"""

# Fix up your scatter plot here:
ax.scatter(df[x_axis_name],df[y_axis_name])  # set c= to color your points based on their `y` value

plt.show()

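The [Adjusted $R^2$ formula](https://en.wikipedia.org/wiki/Coefficient_of_determination#Adjusted_R2): $\bar{R}^2 = 1 - (1 - R^2)\,\frac{n - 1}{n - p - 1}$, where $n$ is the number of observations and $p$ is the number of predictors (not counting the intercept).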