In [1]:
# Dependencies
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression, make_sparse_uncorrelated

### All Correlated Dataset of 6 Features

From documentation:
- make_regression produces regression targets as an optionally-sparse random linear combination of random features, with noise. Its informative features may be uncorrelated, or low rank (few features account for most of the variance).

In [2]:
# Dataset 1: all 6 features contribute to the regression target.
# n_samples, n_features, n_informative, coef and random_state are set
# explicitly; the remaining arguments repeat the library defaults for clarity.
X, y, coef = make_regression(
    n_samples=200,
    n_features=6,
    n_informative=6,
    coef=True,  # also return the ground-truth coefficients
    n_targets=1,
    bias=0,
    effective_rank=None,
    noise=0,
    random_state=42,
)

In [3]:
# Display the ground-truth coefficients returned by make_regression (coef=True)
coef

array([95.14033422, 63.31513756, 60.16118201, 81.91888594, 88.42064633,
        4.34125329])

In [4]:
# Report the dimensions of the feature matrix and of the target vector
for label, array in (("X shape", X), ("y shape", y)):
    print(label, array.shape)

X shape (200, 6)
y shape (200,)


In [5]:
# Inspect the shape of a single sample (one row of the feature matrix)
first_row = X[0]
print("X row shape", first_row.shape)

X row shape (6,)


In [6]:
# Wrapping y in a list and transposing yields a column vector that can be
# placed next to X
y_column = np.array([y]).T
print("y shape", y_column.shape)

y shape (200, 1)


In [7]:
# Append the target as the last column: column_stack treats the 1-D y as a
# single column placed to the right of X
data = np.column_stack((X, y))

In [8]:
# Name the six feature columns meaningful_1..meaningful_6, followed by the target
feature_names = ['meaningful_{}'.format(i) for i in range(1, 7)]
df = pd.DataFrame(data, columns=feature_names + ['target'])
df.head()

Unnamed: 0,meaningful_1,meaningful_2,meaningful_3,meaningful_4,meaningful_5,meaningful_6,target
0,-1.236951,0.781823,-1.320457,0.521942,0.296985,0.259883,-77.478247
1,-0.241236,0.243339,0.352055,-1.251539,1.443765,-0.045586,38.571844
2,0.963376,-0.820682,0.412781,0.82206,1.896793,1.158596,304.615372
3,-0.240325,0.712998,-0.374821,0.71096,0.444263,-1.222128,91.946823
4,-0.024355,-0.5737,2.14227,1.727543,0.436324,0.522835,272.608712


In [10]:
# Save the all-informative dataset without the index column.
# NOTE: the file uses the standard '.csv' extension (the name previously ended
# in the misspelled '.cvs'); update any reader that still expects the old name.
df.to_csv('regression_data/all_correlated-6-features.csv', index=False)

### 4 Correlated and 2 Uncorrelated Linear Features 

In [11]:
# Dataset 2: 6 features, of which 4 contribute to the target and 2 do not.
# NOTE: shuffle is left at its default (True), so the informative columns are
# not necessarily the first four; the non-zero entries of `coef` identify them.
X, y, coef = make_regression(
    n_samples=200,
    n_features=6,
    n_informative=4,
    coef=True,  # also return the ground-truth coefficients
    n_targets=1,
    bias=0,
    effective_rank=4,
    tail_strength=0.5,
    noise=0,
    random_state=42,
)

In [12]:
# Append the target as the last column: column_stack treats the 1-D y as a
# single column placed to the right of X
data = np.column_stack((X, y))

In [13]:
# make_regression shuffles the feature columns by default, so the informative
# features are not guaranteed to be the first four. Label every column from the
# ground-truth coefficients instead: non-zero -> meaningful, zero -> unrelated.
feature_names = []
n_meaningful = n_unrelated = 0
for weight in coef:
    if weight != 0:
        n_meaningful += 1
        feature_names.append('meaningful_{}'.format(n_meaningful))
    else:
        n_unrelated += 1
        feature_names.append('unrelated_{}'.format(n_unrelated))

df = pd.DataFrame(data, columns=feature_names + ['target'])
df.head()

Unnamed: 0,meaningful_1,meaningful_2,meaningful_3,meaningful_4,unrelated_1,unrelated_2,target
0,-0.11407,0.055827,0.049792,-0.014761,-0.027299,-0.007287,0.163435
1,-0.037493,-0.059537,0.009,0.005654,0.036818,0.057861,3.625903
2,-0.015487,0.062475,-0.032665,0.010558,-0.070207,-0.078827,-7.482244
3,0.026067,0.032296,-0.075725,0.013712,0.063807,-0.063561,4.828345
4,-0.009825,0.035543,-0.03379,-0.026442,-0.023761,0.13932,8.710039


In [14]:
# Save the 4-informative / 2-uninformative dataset without the index column.
# NOTE: the file uses the standard '.csv' extension (the name previously ended
# in the misspelled '.cvs'); update any reader that still expects the old name.
df.to_csv('regression_data/4-correlated-6-features.csv', index=False)

### 4 Correlated and 2 Uncorrelated Linear Features (Sparse Uncorrelated Design)

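From documentation:
- Generate a random regression problem with sparse uncorrelated design. Only the first 4 features are informative, `y = X[:, 0] + 2 * X[:, 1] - 2 * X[:, 2] - 1.5 * X[:, 3]` plus unit-variance Gaussian noise; the remaining features are useless.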

In [15]:
# Dataset 3: sparse uncorrelated design. Only the first 4 features drive the
# target, with coefficients [1, 2, -2, -1.5]; the remaining features are useless.
X, y = make_sparse_uncorrelated(
    n_samples=200,
    n_features=6,
    random_state=42,
)

In [16]:
# Append the target as the last column: column_stack treats the 1-D y as a
# single column placed to the right of X
data = np.column_stack((X, y))

In [17]:
# Four informative feature columns, then two uninformative ones, then the target
informative_names = ['meaningful_{}'.format(i) for i in range(1, 5)]
unrelated_names = ['unrelated_{}'.format(i) for i in range(1, 3)]
df = pd.DataFrame(data, columns=informative_names + unrelated_names + ['target'])
df.head()

Unnamed: 0,meaningful_1,meaningful_2,meaningful_3,meaningful_4,unrelated_1,unrelated_2,target
0,0.496714,-0.138264,0.647689,1.52303,-0.234153,-0.234137,-3.234512
1,1.579213,0.767435,-0.469474,0.54256,-0.463418,-0.46573,2.809785
2,0.241962,-1.91328,-1.724918,-0.562288,-1.012831,0.314247,0.830966
3,-0.908024,-1.412304,1.465649,-0.225776,0.067528,-1.424748,-5.781967
4,-0.544383,0.110923,-1.150994,0.375698,-0.600639,-0.291694,1.464763


In [19]:
# Save the sparse uncorrelated dataset without the index column.
# NOTE: the file uses the standard '.csv' extension (the name previously ended
# in the misspelled '.cvs'); update any reader that still expects the old name.
df.to_csv('regression_data/4_correlated-6-features-sparse.csv', index=False)