# SQL

### Date, time built-in functions
-DATE: YYYYMMDD <br>
-TIME: HHMMSS <br>
-TIMESTAMP: YYYYXXDDHHMMSSZZZZZZ (XX = month, ZZZZZZ = microseconds) <br>

SELECT DAY(saledate)<br>
FROM petsale<br>
WHERE animal='cat';

SELECT COUNT(* ) from petsale<br>
WHERE MONTH(saledate)='05';

SELECT (saledate + 3 DAYS)<br>
FROM petsale;

SELECT (CURRENT_DATE - saledate) <br>
FROM petsale;

### Sub-Queries

SELECT *<br>
FROM table1<br>
WHERE salary < <br>
(  SELECT AVG(salary)<br>
   FROM table1);

#### Column expressions

SELECT emp_id, salary,<br>
(SELECT AVG(salary) FROM employees) as avg_salary<br>
FROM employees;

#### Joins

SELECT * FROM employees, departments<br>
WHERE employees.dep_id = departments.dept_id_dep;

Inner join: intersection

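Left outer join: all rows of the left table are kept; where there is no match, the right table's columns are NULL <br>
Right outer join: all rows of the right table are kept; where there is no match, the left table's columns are NULL <br>
Full outer join: all rows of both tables are kept; unmatched columns on either side are NULL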

#### dbmodule

In [None]:
from dbmodule import connect  # `dbmodule` stands in for a DB-API style driver module

# Create the connection
connection = connect('databasename', 'username', 'password')

try:
    # Create cursor
    cursor = connection.cursor()
    try:
        # Run queries
        cursor.execute('SELECT * FROM mytable')
        Results = cursor.fetchall()
    finally:
        # Free the cursor even if the query raised
        cursor.close()
finally:
    # Always release the connection, even on error
    connection.close()

#### ibm_db

In [None]:
!pip install ibm_db

In [None]:
import os
from getpass import getpass

import ibm_db

# First, get credentials from IBM: Db2 -> Service Credentials -> New Credentials ->
# The user id and password are read from environment variables (or prompted for)
# so that no secret is saved inside the notebook.
dsn_hostname = os.environ.get("DB2_HOSTNAME", "dashdb-txn-sbox-yp-lon02-04.services.eu-gb.bluemix.net")
dsn_uid = os.environ.get("DB2_UID") or input("Db2 user id: ")
dsn_pwd = os.environ.get("DB2_PWD") or getpass("Db2 password: ")
dsn_driver = "{IBM DB2 ODBC DRIVER}"
dsn_database = "BLUDB"
dsn_port = "50000"
dsn_protocol = "TCPIP"

In [None]:
# Create database string: "KEYWORD=value;" pairs on a single line,
# without newlines or indentation between the keywords
dsn = (
    f"DRIVER={dsn_driver};"
    f"DATABASE={dsn_database};"
    f"HOSTNAME={dsn_hostname};"
    f"PORT={dsn_port};"
    f"PROTOCOL={dsn_protocol};"
    f"UID={dsn_uid};"
    f"PWD={dsn_pwd};"
)

# Show the string with the password masked so the secret never ends up
# in the saved cell output
print(dsn.replace(f"PWD={dsn_pwd};", "PWD=********;"))

In [None]:
# Create database connection
try:
    conn = ibm_db.connect(dsn, "", "")
    print ("Connected to database: ", dsn_database, "as user: ", dsn_uid, "on host: ", dsn_hostname)

except Exception:
    # Report the driver's message, then re-raise: the following cells need `conn`,
    # so carrying on silently would only fail later with a confusing NameError.
    print ("Unable to connect: ", ibm_db.conn_errormsg() )
    raise

In [None]:
# Ask the server to describe itself and print a few identifying fields
server = ibm_db.server_info(conn)

for label, field in (
    ("DBMS_NAME: ", "DBMS_NAME"),
    ("DBMS_VER:  ", "DBMS_VER"),
    ("DB_NAME:   ", "DB_NAME"),
):
    print(label, getattr(server, field))

In [None]:
# Ask the database client (driver) to describe itself and print its fields
client = ibm_db.client_info(conn)

for label, field in (
    ("DRIVER_NAME:          ", "DRIVER_NAME"),
    ("DRIVER_VER:           ", "DRIVER_VER"),
    ("DATA_SOURCE_NAME:     ", "DATA_SOURCE_NAME"),
    ("DRIVER_ODBC_VER:      ", "DRIVER_ODBC_VER"),
    ("ODBC_VER:             ", "ODBC_VER"),
    ("ODBC_SQL_CONFORMANCE: ", "ODBC_SQL_CONFORMANCE"),
    ("APPL_CODEPAGE:        ", "APPL_CODEPAGE"),
    ("CONN_CODEPAGE:        ", "CONN_CODEPAGE"),
):
    print(label, getattr(client, field))

In [None]:
stmt = ibm_db.exec_immediate(conn, 
"""
SELECT * FROM EMPLOYEES
""")

In [None]:
ibm_db.fetch_both(stmt) 

In [None]:
ibm_db.close(conn)

#### Pandas ibm_db_dbi

In [None]:
import pandas as pd
import ibm_db_dbi

# Wrap the ibm_db connection in a DB-API connection that pandas can use.
# `conn` must still be open here (i.e. not yet closed with ibm_db.close).
pconn = ibm_db_dbi.Connection(conn)
# pandas is imported as `pd`, so the call has to go through that alias
df = pd.read_sql('SELECT * FROM table', pconn)  # `table` is a placeholder for a real table name

#### Accessing Db2 database with magic method

In [None]:
!pip install ipython-sql

In [None]:
!pip install ibm_db_sa

In [None]:
%load_ext sql

In [None]:
%sql ibm_db_sa://rdj81688:289xd1lw-zfjsj09@dashdb-txn-sbox-yp-lon02-04.services.eu-gb.bluemix.net:50000/BLUDB

In [None]:
%%sql

CREATE TABLE INTERNATIONAL_STUDENT_TEST_SCORES (
	country VARCHAR(50),
	first_name VARCHAR(50),
	last_name VARCHAR(50),
	test_score INT
);
INSERT INTO INTERNATIONAL_STUDENT_TEST_SCORES (country, first_name, last_name, test_score)
VALUES
('United States', 'Marshall', 'Bernadot', 54),
('Australia', 'Eduard', 'Leipelt', 53)

In [None]:
country = 'Canada'
%sql select * from table where country = :country

In [None]:
result = %sql SELECT testscore as "Test Score", count(*) from table ;

#### Pandas, ibm_db and matplotlib some magic

In [None]:
%load_ext sql

In [47]:
import ibm_db_sa

In [None]:
%sql ibm_db_sa://rdj81688:289xd1lw-zfjsj09@dashdb-txn-sbox-yp-lon02-04.services.eu-gb.bluemix.net:50000/BLUDB

In [None]:
# Checking the number of records
%sql SELECT COUNT(*) FROM table;

#### Executing cmd commands

In [None]:
from __future__ import print_function

In [None]:
!pip3 install --upgrade pip

# R Studio

#### Uploading a dataset

In [None]:
# Read the CSV without treating the first row as a header.
# R's logical constants are upper case: FALSE, not False.
df = read.csv('./Documents/my_notes/my_notes/ML/data/my_data.csv', header = FALSE)
View(df)

#### R Packages
cran.r-project.org

#### Installing packages
install.packages('audio')

#### Load a package
library(audio) <br>
play(sin(1:10000/20))

#### Plotting

In [None]:
# Two random samples of 100 values; y has a standard deviation of 10
x = rnorm(100)
y = rnorm(100, sd = 10)

df = data.frame(x, y)
View(df)  # R is case sensitive: the viewer function is View()

library(ggplot2)
# The aesthetics must name columns that exist in df, i.e. x and y
ggplot(df, aes(x = x, y = y)) + geom_point()

# Jupyter notebooks in watson

You can create jobs, import other code from other jupyter notebooks etc

# Data analysis with Python

In [None]:
# Import a .csv file without a header
import pandas as pd
url = 'https://...'
df = pd.read_csv(url, header = None)

In [None]:
# Replace headers
headers = ['Col1', 'Col2', 'Col3']
df.columns = headers

Pandas also read:
- json files
- excel files
- sql files

In [None]:
df.describe() # works for numerical columns
df.describe(include='all') # for all columns

#### Handle missing values

In [None]:
# Returns a new dataframe without the rows where 'col1' is missing;
# df itself is left unchanged
df.dropna(subset=['col1'], axis=0)
# Drops those rows from df itself (inplace=True)
df.dropna(subset=['col1'], axis=0, inplace=True)

In [None]:
# Replace the missing values of one column with that column's mean.
# fillna returns a new Series, so the result has to be assigned back;
# restricting it to the column keeps the other columns' NaNs untouched.
mean_val = df['target_column'].mean()
df['target_column'] = df['target_column'].fillna(mean_val)

#### Data Normalization

In [None]:
# Three alternative normalizations: apply only ONE of them to a column.
# Run top to bottom as written, each line would rescale the previous result.
# Simple scale: divide by the maximum
df['target'] = df['target']/df['target'].max()
# Min-Max: shift by the minimum, divide by the range
df['target'] = (df['target'] - df['target'].min())/(df['target'].max() - df['target'].min())
# Z score: subtract the mean, divide by the standard deviation
df['target'] = (df['target'] - df['target'].mean())/df['target'].std() # values typically fall between -3 and +3

#### Binning - grouping values into bins

In [None]:
# One label per bin; n labels need n + 1 equally spaced edges between the
# lowest and the highest price.
group_names = ['Low', 'Medium', 'High']
bins = np.linspace(min(df['price']), max(df['price']), len(group_names) + 1)
df['priced_bins'] = pd.cut(
    df['price'],
    bins=bins,
    labels=group_names,
    include_lowest=True,  # the minimum price belongs to the first bin
)

In [None]:
# Transforming categorical data into 1 and 0
pd.get_dummies(df['target'])

#### Descriptive statistics

In [None]:
# describe() is a method of the dataframe, not a function of the pandas module
df.describe()
# Box plots: good to observe outliers
sns.boxplot(x='engine', y='price', data=df)
# Scatterplots (on a new figure, so they are not drawn on top of the box plot)
plt.figure()
x = df['price']
y = df['engine']
plt.scatter(x, y)
plt.title('Title')
plt.xlabel('X Axis')
plt.ylabel('Y Axis')

#### Grouping by

In [None]:
# Mean of col3 for every (col1, col2) combination
new_df = df[['col1','col2','col3']]
grouped_df = new_df.groupby(['col1', 'col2'], as_index=False).mean()
# Pivot the grouped result (one row per col1/col2 pair), not the raw dataframe:
# col1 values become the rows and col2 values become the columns
grouped_pivot = grouped_df.pivot(index='col1', columns='col2')
grouped_pivot

#### Correlation (Pearson coefficient) and p-value

Correlation coefficient: <br>
- +1 large positive relationship
- -1 large negative relationship
- 0  no relationship <br>

P-value: <br>
- p-value < 0.001 Strong certainty in the result
- p-value < 0.05 Moderate certainty in the result
- p-value < 0.1 Weak certainty in the result
- p-value > 0.1 No certainty in the result

e.g. of strong correlation: cc close to +1 or -1 and p-value < 0.001<br>
Correlation heatmap shows all the pairwise correlations, with 1 on the main diagonal (each variable with itself)

In [None]:
from scipy import stats  # needed here: this cell runs before the ANOVA cell that imports it

# pearsonr returns the correlation coefficient and the p-value of the test
pearson_coef, p_value = stats.pearsonr(df['col1'], df['col2'])

In [None]:
# Find a correlation and illustrate it:
# corr() gives the pairwise correlation matrix of the two columns,
# regplot draws the scatter plot together with a fitted regression line
df[['col1', 'col2']].corr()
sns.regplot(x='col1', y='col2', data=df)

#### ANOVA - Analysis of Variance

1. F score : ratio of the variation between the group means to the variation within each group; a large F score means the group means differ a lot compared with the spread inside a class
2. p-value


In [None]:
from scipy import stats

# Keep the grouping column in the subset: the groups are looked up by make
df_anova = df[['make', 'price']]
# Group by a single key (not a one-element list) so get_group accepts a plain label
grouped_anova = df_anova.groupby('make')
# One-way ANOVA on the prices of two makes; the result holds the F score and p-value
anova_results = stats.f_oneway(grouped_anova.get_group('honda')['price'], grouped_anova.get_group('subaru')['price'])

In [None]:
# To count categorical values
df['col1'].value_counts().to_frame()

In [None]:
# Create a new column with the value counts
df_counts = df['target'].value_counts().to_frame()
df_counts.rename(columns={'col1':'col2'}, inplace=True)
df_counts.index.name='index_name'

Different types of regression:
- Simple linear regression (SLR)
- Multiple linear regression (MLR)
- Polynomial regression (PR)

In [None]:
# SLR
from sklearn.linear_model import LinearRegression

slr = LinearRegression()
X = ...  # placeholder: predictor as a 2-D structure, e.g. df[['col1']]
y = ...  # placeholder: target values, e.g. df['target']
slr.fit(X, y)
# Predict from the predictor data the model was fitted on (`z` was never defined)
y_hat = slr.predict(X)
slr.intercept_ # b0/intercept
slr.coef_ # b1/slope

In [None]:
# MLR
from sklearn.linear_model import LinearRegression

# Several predictor columns instead of one; `y` is the target defined earlier
X = df[['var1', 'var2', 'var3', 'var4']]
mlr = LinearRegression()
mlr.fit(X, y)

In [None]:
# PR

# Visualization function
def PlotPolly(model, independent_variable, dependent_variabble, Name):
    """Plot observed data points together with a fitted curve.

    Parameters
    ----------
    model : callable
        Fitted model, evaluated as ``model(x)`` on an array of x values.
    independent_variable, dependent_variabble : array-like
        Observed x and y values, drawn as dots.
    Name : str
        Label of the x axis.

    The curve is evaluated on 100 evenly spaced points between 15 and 55.
    The figure is shown and then closed; nothing is returned.
    """
    x_new = np.linspace(15, 55, 100)
    y_new = model(x_new)

    # dots for the observations, a solid line for the fitted curve
    plt.plot(independent_variable, dependent_variabble, '.', x_new, y_new, '-')
    plt.title('Polynomial Fit with Matplotlib for Price ~ Length')
    plt.gca().set_facecolor((0.898, 0.898, 0.898))  # light grey background
    plt.xlabel(Name)
    plt.ylabel('Price of Cars')

    plt.show()
    plt.close()

# Values: predictor (x) and target (y) columns
x = df['highway-mpg']
y = df['price']

# Here we use a polynomial of the 3rd order (cubic) 
# f holds the fitted coefficients; p wraps them in a callable polynomial
f = np.polyfit(x, y, 3)
p = np.poly1d(f)
print(p)

# Plotting: data points plus the fitted curve
PlotPolly(p, x, y, 'highway-mpg')

#### Creating a residual plot (to see how the error behaves and whether a curved line is needed)

In [None]:
import seaborn as sns
# Regression plot: scatter of the data plus the fitted line
sns.regplot(x = 'x-axis', y='y-axis', data = df)
# Residual plot: x and y are passed by keyword, because current seaborn
# treats the first positional argument as `data`
sns.residplot(x=df['x-axis'], y=df['y-axis'])

#### Creating a distribution plot

In [None]:
# Density curves without histogram bars. displot is figure-level and accepts
# neither hist= nor ax=, so the axes-level kdeplot is used to overlay both curves.
ax1 = sns.kdeplot(df['price'], color='r', label='Actual Value')
sns.kdeplot(Yhat, color="b", label="Fitted Values", ax=ax1)
ax1.legend()  # make the two labels visible

Pipelines
data -> Normalization -> Polynomial Transformation -> Linear Regression -> prediction

#### Ways to determine the efficiency of a model:
1. MSE: mean squared error:<br>
from sklearn.metrics import mean_squared_error <br>
mean_squared_error(actual, predicted)<br>
2. R^2: R squared:<br>
slr.score(X, y) (takes the features and the actual target values, not the predictions)<br>

In [None]:
# Splitting dataset
from sklearn.model_selection import train_test_split
# Doing cross validation (the number of folds is given by cv)
from sklearn.model_selection import cross_val_score
scores = cross_val_score(slr, x_data, y_data, cv=3) # cv=3: dataset split into 3 different partitions (folds)

#### Ridge regression (regularization for higher-order polynomial regression, to damp the effect of outliers and overfitting)

In [None]:
from sklearn.linear_model import Ridge
ridge_model = Ridge(alpha=0.1)

#### Grid search (used to choose the parameters for the model)

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Only `alpha` is searched: `normalize` is not a Ridge parameter in current
# scikit-learn, so listing it makes the grid search fail with an invalid-parameter
# error. Scale the features beforehand (e.g. StandardScaler in a Pipeline) if needed.
parameters = [{'alpha':[0.001, 0.01, 1, 10, 1000]}]
ridge_model = Ridge()
# 4-fold cross validation for every candidate alpha
Grid1 = GridSearchCV(ridge_model, parameters, cv=4)
Grid1.fit(X, y)
Grid1.best_estimator_
scores = Grid1.cv_results_

# Data Visualization

In [None]:
%matplotlib inline
# %matplotlib notebook - interactive
import matplotlib.pyplot as plt

plt.plot(5, 5, 'o')
plt.title('Plotting Example')
plt.xlabel('x')
plt.ylabel('y')
plt.show()

#### Direct plot function in pandas module

In [None]:
# Reading data from an excel
# !pip install xlrd
# df = pd.read_excel('https://...', sheet_name='dsad', skiprows=range(10), skipfooter=2)

In [None]:
import pandas as pd

df_can = pd.read_csv('canada.csv')
# Drop the columns that are not needed (columns= already says what to drop,
# so no axis argument is required)
df_can.drop(columns=[' ' ,'Type', 'Coverage', 'AREA', 'REG', 'RegName', 'DEV', 'DevName'], inplace=True)
df_can.rename(columns={'OdName':'Country', 'AreaName': 'Continent'}, inplace=True)
df_can = df_can.set_index('Country')
# df_can = df_can.T
# df_can = df_can[['China', 'India']]
# Sum only the numeric (year) columns; the text column 'Continent' is skipped
df_can['Total'] = df_can.sum(axis=1, numeric_only=True)
df_can.tail()

In [None]:
# df_can.plot(kind='line')
df_can.loc['Haiti', list(map(str, range(1980, 2014)))].plot(kind='line')

In [None]:
# df_can['India'].plot(kind='hist')
df_can.loc['Haiti', list(map(str, range(1980, 2014)))].plot(kind='hist')

In [None]:
import numpy as np
count, bin_edges = np.histogram(df_can['2013'])

df_can['2013'].plot(kind='hist', figsize=(8, 5), xticks=bin_edges)

In [None]:
df_t = df_can.loc[['Denmark', 'Norway', 'Sweden'], list(map(str, range(1980, 2014)))].transpose()

count, bin_edges = np.histogram(df_t, 15)
xmin = bin_edges[0] - 10   #  first bin value is 31.0, adding buffer of 10 for aesthetic purposes 
xmax = bin_edges[-1] + 10  #  last bin value is 308.0, adding buffer of 10 for aesthetic purposes

# stacked Histogram
df_t.plot(kind='hist',
          figsize=(10, 6), 
          bins=15,
          xticks=bin_edges,
          color=['coral', 'darkslateblue', 'mediumseagreen'],
          stacked=True,
          xlim=(xmin, xmax)
         )

#### Area plots

In [None]:
df_can.sort_values(['Total'], ascending = False, axis=0, inplace=True)

In [None]:
df_top = df_can.head()
df_top = df_top[list(map(str, range(1980, 2014)))].transpose()
df_top.head()

In [None]:
df_top.plot(kind='area',figsize=(20, 10))

In [None]:
df_top.index = df_top.index.map(int) # change the index values of df_top to type integer for plotting
df_top.plot(kind='area', 
             alpha=0.25, # transparency, 0-1 (default value 0.5)
             stacked=False, # draw the areas overlapping instead of on top of each other
             figsize=(20, 10),
            )

#### Barchart

In [None]:
df_iceland = df_can.loc['Iceland', list(map(str, range(1980, 2014)))]
df_iceland.plot(kind='bar')

In [None]:
df_iceland.plot(kind='bar', figsize=(10, 6), rot=90) 

plt.xlabel('Year')
plt.ylabel('Number of Immigrants')
plt.title('Icelandic Immigrants to Canada from 1980 to 2013')

# The x coordinates below are bar positions counted from 0 (the 1980 bar),
# so position 28 is the year 2008 and position 32 is the year 2012.

# Annotate arrow
plt.annotate('',                      # s: str. will leave it blank for no text
             xy=(32, 70),             # place head of the arrow at point (year 2012 , pop 70)
             xytext=(28, 20),         # place base of the arrow at point (year 2008 , pop 20)
             xycoords='data',         # will use the coordinate system of the object being annotated 
             arrowprops=dict(arrowstyle='->', connectionstyle='arc3', color='blue', lw=2)
            )

# Annotate Text
plt.annotate('2008 - 2011 Financial Crisis', # text to display
             xy=(28, 30),                    # start the text at point (year 2008 , pop 30)
             rotation=72.5,                  # based on trial and error to match the arrow
             va='bottom',                    # want the text to be vertically 'bottom' aligned
             ha='left',                      # want the text to be horizontally 'left' aligned.
            )

#### Pie charts

In [None]:
# Total immigration per continent: rows are grouped by the 'Continent' column.
# Row-wise grouping is the default; the explicit axis argument is deprecated in recent pandas.
df_cont = df_can.groupby('Continent').sum()
df_cont

In [None]:
df_cont['Total'].plot(kind='pie')

In [None]:
colors_list = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue', 'lightgreen', 'pink']
explode_list = [0.1, 0, 0, 0, 0.1, 0.1] # ratio for each continent with which to offset each wedge.

df_cont['Total'].plot(kind='pie',
                            figsize=(15, 6),
                            autopct='%1.1f%%', 
                            startangle=90,    
                            shadow=True,       
                            labels=None,         # turn off labels on pie chart
                            pctdistance=1.12,    # the ratio between the center of each pie slice and the start of the text generated by autopct 
                            colors=colors_list,  # add custom colors
                            explode=explode_list # 'explode' lowest 3 continents
                            )

# scale the title up by 12% to match pctdistance
plt.title('Immigration to Canada by Continent [1980 - 2013]', y=1.12) 

plt.axis('equal') 

# add legend
plt.legend(labels=df_cont.index, loc='upper left') 

plt.show()

#### Box plots
Useful to detect outliers

In [None]:
import matplotlib.pyplot as plt

df_japan = df_can.loc[['Japan'], list(map(str, range(1980, 2014)))].transpose()
df_japan.plot(kind='box', figsize=(10, 7), color='blue', vert=True)

#### Sub plots

In [None]:
fig = plt.figure() # create figure

ax0 = fig.add_subplot(1, 2, 1) # add subplot 1 (1 row, 2 columns, first plot)
ax1 = fig.add_subplot(1, 2, 2) # add subplot 2 (1 row, 2 columns, second plot)

# The titles name Japan because df_japan is the data plotted in both panels

# Subplot 1: Box plot
df_japan.plot(kind='box', color='blue', vert=False, figsize=(20, 6), ax=ax0) # add to subplot 1
ax0.set_title('Box Plot of Immigrants from Japan (1980 - 2013)')
ax0.set_xlabel('Number of Immigrants')
ax0.set_ylabel('Country')

# Subplot 2: Line plot
df_japan.plot(kind='line', figsize=(20, 6), ax=ax1) # add to subplot 2
ax1.set_title ('Line Plot of Immigrants from Japan (1980 - 2013)')
ax1.set_ylabel('Number of Immigrants')
ax1.set_xlabel('Years')

plt.show()

#### Scatter plot

In [None]:
# Total immigration per year: sum every year column (1980-2013) over all countries
df_tot = pd.DataFrame(df_can[[str(year) for year in range(1980, 2014)]].sum(axis=0))
# The year labels are strings; turn them into integers
df_tot.index = [int(label) for label in df_tot.index]
# Move the years from the index into a regular column and name both columns
df_tot = df_tot.reset_index()
df_tot.columns = ['year', 'total']
df_tot.head()

In [None]:
df_tot.plot(kind='scatter', x='year', y='total', figsize=(10, 6), color='darkblue')

plt.title('Total Immigration to Canada from 1980 - 2013')
plt.xlabel('Year')
plt.ylabel('Number of Immigrants')

plt.show()

In [None]:
x = df_tot['year']      # year on x-axis
y = df_tot['total']     # total on y-axis
fit = np.polyfit(x, y, deg=1)

fit

In [None]:
df_tot.plot(kind='scatter', x='year', y='total', figsize=(10, 6), color='darkblue')

plt.title('Total Immigration to Canada from 1980 - 2013')
plt.xlabel('Year')
plt.ylabel('Number of Immigrants')

# plot line of best fit
plt.plot(x, fit[0] * x + fit[1], color='red') # recall that x is the Years
plt.annotate('y={0:.0f} x + {1:.0f}'.format(fit[0], fit[1]), xy=(2000, 150000))

plt.show()

# print out the line of best fit
'No. Immigrants = {0:.0f} * Year + {1:.0f}'.format(fit[0], fit[1]) 

#### Bubble plots

In [None]:
df_can_t = df_can[list(map(str, range(1980, 2014)))].transpose() # transposed dataframe

# cast the Years (the index) to type int
df_can_t.index = map(int, df_can_t.index)

# let's label the index. This will automatically be the column name when we reset the index
df_can_t.index.name = 'Year'

# reset index to bring the Year in as a column
df_can_t.reset_index(inplace=True)

# view the changes
df_can_t.head()

In [None]:
# normalize Brazil data
norm_brazil = (df_can_t['Brazil'] - df_can_t['Brazil'].min()) / (df_can_t['Brazil'].max() - df_can_t['Brazil'].min())

# normalize Argentina data
norm_argentina = (df_can_t['Argentina'] - df_can_t['Argentina'].min()) / (df_can_t['Argentina'].max() - df_can_t['Argentina'].min())

In [None]:
# Brazil
ax0 = df_can_t.plot(kind='scatter',
                    x='Year',
                    y='Brazil',
                    figsize=(14, 8),
                    alpha=0.5,                  # transparency
                    color='red',
                    s=norm_brazil * 2000 + 10,  # pass in weights 
                    xlim=(1975, 2015)
                   )

# Argentina
ax1 = df_can_t.plot(kind='scatter',
                    x='Year',
                    y='Argentina',
                    alpha=0.5,
                    color="yellow",
                    s=norm_argentina * 2000 + 10,
                    ax = ax0
                   )

ax0.set_ylabel('Number of Immigrants')
ax0.set_title('Immigration from Brazil and Argentina from 1980 - 2013')
ax0.legend(['Brazil', 'Argentina'], loc='upper left', fontsize='x-large')

#### Waffle chart

In [None]:
%matplotlib inline

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches # needed for waffle Charts

mpl.style.use('ggplot') # optional: for ggplot-like style

# check for latest version of Matplotlib
print ('Matplotlib version: ', mpl.__version__) # >= 2.0.0

df_dsn = df_can.loc[['Denmark', 'Norway', 'Sweden'], :]

def create_waffle_chart(categories, values, height, width, colormap, value_sign=''):
    """Draw a waffle chart: a height x width grid of tiles shared out between
    the categories in proportion to their values.

    Parameters
    ----------
    categories : sequence of str
        Category names, in the same order as ``values``.
    values : sequence of numbers
        One value per category (a list, array or pandas Series).
    height, width : int
        Number of tile rows and tile columns.
    colormap : matplotlib colormap
        Colormap used for both the tiles and the legend patches.
    value_sign : str, optional
        Sign shown with each value in the legend; '%' is appended after the
        value, any other sign is placed in front of it.

    Prints the tile counts and draws the chart on a new figure; returns None.
    """
    # work on plain lists so that the positional indexing below does not
    # depend on the index labels of a pandas Series passed by the caller
    categories = list(categories)
    values = list(values)

    # compute the proportion of each category with respect to the total
    total_values = sum(values)
    category_proportions = [(float(value) / total_values) for value in values]

    # compute the total number of tiles
    total_num_tiles = width * height
    print ('Total number of tiles is', total_num_tiles)

    # compute the number of tiles for each category
    tiles_per_category = [round(proportion * total_num_tiles) for proportion in category_proportions]

    # print out number of tiles per category (names come from the argument,
    # not from a global dataframe)
    for i, tiles in enumerate(tiles_per_category):
        print (categories[i] + ': ' + str(tiles))

    # initialize the waffle chart as an empty matrix
    waffle_chart = np.zeros((height, width))

    # define indices to loop through waffle chart
    category_index = 0
    tile_index = 0

    # populate the waffle chart column by column
    for col in range(width):
        for row in range(height):
            tile_index += 1

            # if this tile lies beyond the tiles allocated to the categories
            # handled so far...
            if tile_index > sum(tiles_per_category[0:category_index]):
                # ...proceed to the next category
                category_index += 1

            # set the class value to an integer, which increases with class
            waffle_chart[row, col] = category_index

    # use matshow to display the waffle chart; matshow opens its own figure,
    # so no separate (empty) figure is created beforehand
    plt.matshow(waffle_chart, cmap=colormap)
    plt.colorbar()

    # get the axis
    ax = plt.gca()

    # set minor ticks
    ax.set_xticks(np.arange(-.5, (width), 1), minor=True)
    ax.set_yticks(np.arange(-.5, (height), 1), minor=True)

    # add gridlines based on minor ticks
    ax.grid(which='minor', color='w', linestyle='-', linewidth=2)

    plt.xticks([])
    plt.yticks([])

    # compute cumulative sum of individual categories to match color schemes between chart and legend
    values_cumsum = np.cumsum(values)
    total_values = values_cumsum[-1]

    # create legend
    legend_handles = []
    for i, category in enumerate(categories):
        if value_sign == '%':
            label_str = category + ' (' + str(values[i]) + value_sign + ')'
        else:
            label_str = category + ' (' + value_sign + str(values[i]) + ')'

        color_val = colormap(float(values_cumsum[i])/total_values)
        legend_handles.append(mpatches.Patch(color=color_val, label=label_str))

    # add legend to chart
    plt.legend(
        handles=legend_handles,
        loc='lower center', 
        ncol=len(categories),
        bbox_to_anchor=(0., -0.2, 0.95, .1)
    )

width = 40 # width of chart
height = 10 # height of chart

categories = df_dsn.index.values # categories
values = df_dsn['Total'] # correponding values of categories

colormap = plt.cm.coolwarm # color map class

create_waffle_chart(categories, values, height, width, colormap)

In [None]:
fp = pd.read_csv('players_20.csv')
pd.set_option('display.max_columns', 500)
fp = fp[['short_name', 'age', 'height_cm', 'nationality', 'club', 'overall', 'value_eur', 'wage_eur', 'team_position', 'pace', 'dribbling']]
fp.sort_values(['dribbling'], ascending = False, axis=0, inplace=False).head(10)

#### Word cloud

In [None]:
# install wordcloud
!conda install -c conda-forge wordcloud==1.4.1 --yes

# import package and its set of stopwords
from wordcloud import WordCloud, STOPWORDS

print ('Wordcloud is installed and imported!')

In [None]:
# download file and save as alice_novel.txt
!wget --quiet https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DV0101EN/labs/Data_Files/alice_novel.txt

# open the file and read it into a variable alice_novel
alice_novel = open('alice_novel.txt', 'r').read()
    
print ('File downloaded and saved!')

#### Next, let's use the stopwords that we imported from `word_cloud`. We use the function *set* to remove any redundant stopwords.

In [None]:
stopwords = set(STOPWORDS)

#### Create a word cloud object and generate a word cloud. For simplicity, let's generate a word cloud using only the first 2000 words in the novel.

In [None]:
# instantiate a word cloud object
alice_wc = WordCloud(
    background_color='white',
    max_words=2000,
    stopwords=stopwords
)

# generate the word cloud
alice_wc.generate(alice_novel)

In [None]:
# display the word cloud
fig = plt.figure()
fig.set_figwidth(14) # set width
fig.set_figheight(18) # set height

# display the cloud
plt.imshow(alice_wc, interpolation='bilinear')
plt.axis('off')
plt.show()

#### Much better! However, **said** isn't really an informative word. So let's add it to our stopwords and re-generate the cloud.

In [None]:
stopwords.add('said') # add the words said to stopwords

# re-generate the word cloud
alice_wc.generate(alice_novel)

# display the cloud
fig = plt.figure()
fig.set_figwidth(14) # set width
fig.set_figheight(18) # set height

plt.imshow(alice_wc, interpolation='bilinear')
plt.axis('off')
plt.show()

#### Let's use a mask of Alice and her rabbit. We already created the mask for you, so let's go ahead and download it and call it alice_mask.png.

In [None]:
# download image
!wget --quiet https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DV0101EN/labs/Images/alice_mask.png
    
# save mask to alice_mask
alice_mask = np.array(Image.open('alice_mask.png'))
    
print('Image downloaded and saved!')

#### Let's take a look at what the mask looks like.

In [None]:
fig = plt.figure()
fig.set_figwidth(14) # set width
fig.set_figheight(18) # set height

plt.imshow(alice_mask, cmap=plt.cm.gray, interpolation='bilinear')
plt.axis('off')
plt.show()

#### Shaping the word cloud according to the mask is straightforward using word_cloud package. For simplicity, we will continue using the first 2000 words in the novel.

In [None]:
# instantiate a word cloud object
alice_wc = WordCloud(background_color='white', max_words=2000, mask=alice_mask, stopwords=stopwords)

# generate the word cloud
alice_wc.generate(alice_novel)

# display the word cloud
fig = plt.figure()
fig.set_figwidth(14) # set width
fig.set_figheight(18) # set height

plt.imshow(alice_wc, interpolation='bilinear')
plt.axis('off')
plt.show()

#### On the Canada Dataset

In [None]:
total_immigration = df_can['Total'].sum()
total_immigration

In [None]:
import numpy as np
max_words = 90
word_string = ''
for country in df_can.index.values:
    # names made of several words are left out
    if len(country.split(' ')) != 1:
        continue
    # the country's share of the total immigration decides how often its
    # name is repeated (out of max_words)
    repeat_num_times = int(df_can.loc[country, 'Total'] / float(total_immigration) * max_words)
    word_string += (country + ' ') * repeat_num_times

# display the generated text
word_string

In [None]:
# create the word cloud
wordcloud = WordCloud(background_color='white').generate(word_string)

print('Word cloud created!')

In [None]:
# display the cloud
fig = plt.figure()
fig.set_figwidth(14)
fig.set_figheight(18)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

#### Seaborn regression plots

Create a new dataframe that stores the total number of landed immigrants to Canada per year from 1980 to 2013.

In [None]:
# we can use the sum() method to get the total population per year
df_tot = pd.DataFrame(df_can[list(map(str, range(1980, 2014)))].sum(axis=0))

# change the years to type float (useful for regression later on)
df_tot.index = map(float, df_tot.index)

# reset the index to put in back in as a column in the df_tot dataframe
df_tot.reset_index(inplace=True)

# rename columns
df_tot.columns = ['year', 'total']

# view the final dataframe
df_tot.head()

Generating a regression plot is as simple as calling the regplot function.

In [None]:
import seaborn as sns
ax = sns.regplot(x='year', y='total', data=df_tot)

Let's change the color to green

In [None]:
import seaborn as sns
ax = sns.regplot(x='year', y='total', data=df_tot, color='green')

You can always customize the marker shape, so instead of circular markers, let's use '+'.

In [None]:
import seaborn as sns
ax = sns.regplot(x='year', y='total', data=df_tot, color='green', marker='+')

Let's blow up the plot a little bit so that it is more appealing to the sight.

In [None]:
plt.figure(figsize=(15, 10))
ax = sns.regplot(x='year', y='total', data=df_tot, color='green', marker='+')

And let's increase the size of markers so they match the new size of the figure, and add a title and x- and y-labels.

In [None]:
plt.figure(figsize=(15, 10))
ax = sns.regplot(x='year', y='total', data=df_tot, color='green', marker='+', scatter_kws={'s': 200})

ax.set(xlabel='Year', ylabel='Total Immigration') # add x- and y-labels
ax.set_title('Total Immigration to Canada from 1980 - 2013') # add title

And finally increase the font size of the tickmark labels, the title, and the x- and y-labels so they don't feel left out!

In [None]:
plt.figure(figsize=(15, 10))

sns.set(font_scale=1.5)

ax = sns.regplot(x='year', y='total', data=df_tot, color='green', marker='+', scatter_kws={'s': 200})
ax.set(xlabel='Year', ylabel='Total Immigration')
ax.set_title('Total Immigration to Canada from 1980 - 2013')

Change the style to a white plain background.

In [None]:
plt.figure(figsize=(15, 10))

sns.set(font_scale=1.5)
sns.set_style('ticks') # change background to white background

ax = sns.regplot(x='year', y='total', data=df_tot, color='green', marker='+', scatter_kws={'s': 200})
ax.set(xlabel='Year', ylabel='Total Immigration')
ax.set_title('Total Immigration to Canada from 1980 - 2013')

Or to a white background with gridlines.

In [None]:
plt.figure(figsize=(15, 10))

sns.set(font_scale=1.5)
sns.set_style('whitegrid')

ax = sns.regplot(x='year', y='total', data=df_tot, color='green', marker='+', scatter_kws={'s': 200})
ax.set(xlabel='Year', ylabel='Total Immigration')
ax.set_title('Total Immigration to Canada from 1980 - 2013')

#### Folium

In [None]:
!conda install -c conda-forge folium=0.5.0 --yes

In [None]:
import folium
world_map = folium.Map(location=[56, -106], zoom_start=4)
world_map

### Regression
Three types of testing:
- test on a portion of the train set (a.)
- train/test split (b.)
- k-fold cross validation (c.)
# IMAG

Evaluation of the model (metrics):
- MSE: mean squared error
- RMSE: root mean squared error
- RSE (relative squared error): $\mathrm{RSE} = \dfrac{\sum_i (y_i - \hat{y}_i)^2}{\sum_i (y_i - \bar{y})^2}$
- $R^2$: $R^2 = 1 - \mathrm{RSE}$