# Linear Regression using Dask Data Frames

This post includes code from [Scalable-Data-Analysis-in-Python-with-Dask](https://github.com/PacktPublishing/-Scalable-Data-Analysis-in-Python-with-Dask/tree/master/Section%202) and [coiled-examples](https://github.com/coiled/coiled-examples).

In [1]:
import numpy as np
import dask.array as da
import pandas as pd
import sqlalchemy as db
from sqlalchemy import create_engine
import sqlite3
import pandas as pd

In [2]:
engine = db.create_engine("sqlite:///fiscal.db")
connection = engine.connect()
metadata = db.MetaData()

In [6]:
#engine.execute("SELECT * FROM fiscal_data LIMIT 1").fetchall()

In [3]:
sql = """
SELECT year
, region
, province
, gdp
, fdi
, it
, specific
FROM fiscal_table
"""

cnxn = connection

In [4]:
df = pd.read_sql(sql, cnxn)

In [10]:
from dask.distributed import Client

client = Client(processes=False, threads_per_worker=2,
                n_workers=3, memory_limit='4GB')
client

In [9]:
client.restart()

0,1
Client  Scheduler: inproc://192.168.1.71/9672/30  Dashboard: http://192.168.1.71:46451/status,Cluster  Workers: 3  Cores: 6  Memory: 12.00 GB


In [11]:
from dask import dataframe as dd 

In [12]:
ddf = dd.from_pandas(df, npartitions=5)

In [13]:
print(ddf)

Dask DataFrame Structure:
                year  region province      gdp    fdi     it specific
npartitions=5                                                        
0              int64  object   object  float64  int64  int64  float64
72               ...     ...      ...      ...    ...    ...      ...
...              ...     ...      ...      ...    ...    ...      ...
288              ...     ...      ...      ...    ...    ...      ...
359              ...     ...      ...      ...    ...    ...      ...
Dask Name: from_pandas, 5 tasks


In [14]:
ddf.head()

Unnamed: 0,year,region,province,gdp,fdi,it,specific
0,1996,East China,Anhui,2093.3,50661,631930,147002.0
1,1997,East China,Anhui,2347.32,43443,657860,151981.0
2,1998,East China,Anhui,2542.96,27673,889463,174930.0
3,1999,East China,Anhui,2712.34,26131,1227364,285324.0
4,2000,East China,Anhui,2902.09,31847,1499110,195580.0


In [15]:
client.id

'Client-4c6edbde-0e23-11eb-a5c8-4b14c8b4f4db'

## Selecting Features and Target

In [16]:
feat_list = ["year", "fdi"]
cat_feat_list = ["region", "province"]
target = ["gdp"]

In [17]:
ddf["year"] = ddf["year"].astype(int)
ddf["fdi"] = ddf["fdi"].astype(float)
ddf["gdp"] = ddf["gdp"].astype(float)
ddf["it"] = ddf["it"].astype(float)

In [18]:
#OHE
from dask_ml.preprocessing import OneHotEncoder

In [19]:
ddf = ddf.categorize(cat_feat_list)

In [20]:
ohe = OneHotEncoder(sparse=False)

In [21]:
ohe_ddf = ohe.fit_transform(ddf[cat_feat_list])

In [23]:
feat_list = feat_list + ohe_ddf.columns.tolist()
feat_list = [f for f in feat_list if f not in cat_feat_list]

In [24]:
ddf_processed = (dd.concat([ddf,ohe_ddf], axis=1) [feat_list + target])

In [25]:
ddf_processed.compute()

Unnamed: 0,year,fdi,region_East China,region_North China,region_Southwest China,region_Northwest China,region_South Central China,region_Northeast China,province_Anhui,province_Beijing,...,province_Shandong,province_Shanghai,province_Shanxi,province_Sichuan,province_Tianjin,province_Tibet,province_Xinjiang,province_Yunnan,province_Zhejiang,gdp
0,1996,50661.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2093.30
1,1997,43443.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2347.32
2,1998,27673.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2542.96
3,1999,26131.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2712.34
4,2000,31847.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2902.09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355,2003,498055.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,9705.02
356,2004,668128.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,11648.70
357,2005,772000.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,13417.68
358,2006,888935.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,15718.47


In [26]:
feat_list

['year',
 'fdi',
 'region_East China',
 'region_North China',
 'region_Southwest China',
 'region_Northwest China',
 'region_South Central China',
 'region_Northeast China',
 'province_Anhui',
 'province_Beijing',
 'province_Chongqing',
 'province_Fujian',
 'province_Gansu',
 'province_Guangdong',
 'province_Guangxi',
 'province_Guizhou',
 'province_Hainan',
 'province_Hebei',
 'province_Heilongjiang',
 'province_Henan',
 'province_Hubei',
 'province_Hunan',
 'province_Jiangsu',
 'province_Jiangxi',
 'province_Jilin',
 'province_Liaoning',
 'province_Ningxia',
 'province_Qinghai',
 'province_Shaanxi',
 'province_Shandong',
 'province_Shanghai',
 'province_Shanxi',
 'province_Sichuan',
 'province_Tianjin',
 'province_Tibet',
 'province_Xinjiang',
 'province_Yunnan',
 'province_Zhejiang',
 'region_East China',
 'region_North China',
 'region_Southwest China',
 'region_Northwest China',
 'region_South Central China',
 'region_Northeast China',
 'province_Anhui',
 'province_Beijing',
 'pro

## Dask Linear Regression

In [29]:
X=ddf_processed[feat_list].persist()
y=ddf_processed[target].persist()

In [34]:
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression, Ridge

In [50]:
X

Unnamed: 0_level_0,year,fdi,region_East China,region_East China,region_North China,region_North China,region_Southwest China,region_Southwest China,region_Northwest China,region_Northwest China,region_South Central China,region_South Central China,region_Northeast China,region_Northeast China,province_Anhui,province_Anhui,province_Beijing,province_Beijing,province_Chongqing,province_Chongqing,province_Fujian,province_Fujian,province_Gansu,province_Gansu,province_Guangdong,province_Guangdong,province_Guangxi,province_Guangxi,province_Guizhou,province_Guizhou,province_Hainan,province_Hainan,province_Hebei,province_Hebei,province_Heilongjiang,province_Heilongjiang,province_Henan,province_Henan,province_Hubei,province_Hubei,province_Hunan,province_Hunan,province_Jiangsu,province_Jiangsu,province_Jiangxi,province_Jiangxi,province_Jilin,province_Jilin,province_Liaoning,province_Liaoning,province_Ningxia,province_Ningxia,province_Qinghai,province_Qinghai,province_Shaanxi,province_Shaanxi,province_Shandong,province_Shandong,province_Shanghai,province_Shanghai,province_Shanxi,province_Shanxi,province_Sichuan,province_Sichuan,province_Tianjin,province_Tianjin,province_Tibet,province_Tibet,province_Xinjiang,province_Xinjiang,province_Yunnan,province_Yunnan,province_Zhejiang,province_Zhejiang,region_East China,region_East China,region_North China,region_North China,region_Southwest China,region_Southwest China,region_Northwest China,region_Northwest China,region_South Central China,region_South Central China,region_Northeast China,region_Northeast China,province_Anhui,province_Anhui,province_Beijing,province_Beijing,province_Chongqing,province_Chongqing,province_Fujian,province_Fujian,province_Gansu,province_Gansu,province_Guangdong,province_Guangdong,province_Guangxi,province_Guangxi,province_Guizhou,province_Guizhou,province_Hainan,province_Hainan,province_Hebei,province_Hebei,province_Heilongjiang,province_Heilongjiang,province_Henan,province_Henan,province_Hubei,province_Hubei,province_Hunan,province_Hunan,province_Jiangsu,province_Jiangsu,province_Jiangxi,province_Jiangxi,province_Jilin,province_Jilin,province_Liaoning,province_Liaoning,province_Ningxia,province_Ningxia,province_Qinghai,province_Qinghai,province_Shaanxi,province_Shaanxi,province_Shandong,province_Shandong,province_Shanghai,province_Shanghai,province_Shanxi,province_Shanxi,province_Sichuan,province_Sichuan,province_Tianjin,province_Tianjin,province_Tibet,province_Tibet,province_Xinjiang,province_Xinjiang,province_Yunnan,province_Yunnan,province_Zhejiang,province_Zhejiang
npartitions=5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1
0,int64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
72,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [51]:
X.compute()

Unnamed: 0,year,fdi,region_East China,region_East China.1,region_North China,region_North China.1,region_Southwest China,region_Southwest China.1,region_Northwest China,region_Northwest China.1,...,province_Tianjin,province_Tianjin.1,province_Tibet,province_Tibet.1,province_Xinjiang,province_Xinjiang.1,province_Yunnan,province_Yunnan.1,province_Zhejiang,province_Zhejiang.1
0,1996,50661.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1997,43443.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1998,27673.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1999,26131.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2000,31847.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355,2003,498055.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
356,2004,668128.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
357,2005,772000.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
358,2006,888935.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [31]:
y

Unnamed: 0_level_0,gdp
npartitions=5,Unnamed: 1_level_1
0,float64
72,...
...,...
288,...
359,...


In [42]:
LinReg = LinearRegression()

In [43]:
LinReg.fit(X, y)

LinearRegression()

In [44]:
RidgeReg = Ridge()
RidgeReg.fit(x, y)

Ridge()

In [46]:
LinReg.predict(x)[:5]

array([[1830.87851079],
       [2076.99855135],
       [2220.28956053],
       [2534.65768132],
       [2936.29581027]])

In [45]:
RidgeReg.predict(x)[:5]

array([[1804.41754025],
       [2053.19939587],
       [2200.05297844],
       [2516.48507702],
       [2919.42271884]])

In [47]:
client.restart()

0,1
Client  Scheduler: inproc://192.168.1.71/9672/30  Dashboard: http://192.168.1.71:46451/status,Cluster  Workers: 3  Cores: 6  Memory: 12.00 GB


In [48]:
client.close()