### Introduction
Machine Learning - Software that can learn from examples & not by rules

* Supervised Machine Learning - Training with features & labels

* Unsupervised Machine Learning - Finding relationship among data. This doesn't reply on labled data

### Supervised Machine Learning
* Regression - Output is continues in nature. Predicting house price, predicting sales, cardekho.com
* Classification - Output is categorical or discrete . Mail spam or not, tumor cancerous or not

### Preprocessing Data

In [1]:
import numpy as np 
from sklearn import preprocessing 

In [2]:
input_data = np.array([[5.1, -2.9, 3.3], 
                       [-1.2, 7.8, -6.1], 
                       [3.9, 0.4, 2.1], 
                       [7.3, -9.9, -4.5]]) 

In [3]:
input_data

array([[ 5.1, -2.9,  3.3],
       [-1.2,  7.8, -6.1],
       [ 3.9,  0.4,  2.1],
       [ 7.3, -9.9, -4.5]])

### Binerization 
Convert into boolean values

### Mean removal
### Scaling
### Normalization 

In [14]:
d = preprocessing.Binarizer(threshold=5.1).transform(input_data)

In [15]:
input_data[1]

array([-1.2,  7.8, -6.1])

In [16]:
d[1,1]

1.0

In [17]:
d

array([[ 0.,  0.,  0.],
       [ 0.,  1.,  0.],
       [ 0.,  0.,  0.],
       [ 1.,  0.,  0.]])

### Mean removal
Remove mean from the feature vector

In [18]:
data_scaled = preprocessing.scale(input_data)

In [19]:
data_scaled

array([[ 0.42462551, -0.2748757 ,  1.13244172],
       [-1.59434861,  1.40579288, -1.18167831],
       [ 0.04005901,  0.24346134,  0.83702214],
       [ 1.12966409, -1.37437851, -0.78778554]])

In [20]:
input_data

array([[ 5.1, -2.9,  3.3],
       [-1.2,  7.8, -6.1],
       [ 3.9,  0.4,  2.1],
       [ 7.3, -9.9, -4.5]])

In [22]:
data_scaled.mean(axis=1)

array([ 0.42739717, -0.45674468,  0.37351416, -0.34416666])

In [24]:
input_data

array([[ 5.1, -2.9,  3.3],
       [-1.2,  7.8, -6.1],
       [ 3.9,  0.4,  2.1],
       [ 7.3, -9.9, -4.5]])

###  Scaling
* Min max scaling

In [36]:
min_max_scalar = preprocessing.MinMaxScaler(feature_range=(-5,-1))

In [37]:
min_max_scalar.fit_transform(input_data)

array([[-2.03529412, -3.4180791 , -1.        ],
       [-5.        , -1.        , -5.        ],
       [-2.6       , -2.67231638, -1.5106383 ],
       [-1.        , -5.        , -4.31914894]])

In [33]:
input_data

array([[ 5.1, -2.9,  3.3],
       [-1.2,  7.8, -6.1],
       [ 3.9,  0.4,  2.1],
       [ 7.3, -9.9, -4.5]])

### Normalization

In [38]:
preprocessing.normalize(input_data, norm='l1')

array([[ 0.45132743, -0.25663717,  0.2920354 ],
       [-0.0794702 ,  0.51655629, -0.40397351],
       [ 0.609375  ,  0.0625    ,  0.328125  ],
       [ 0.33640553, -0.4562212 , -0.20737327]])

In [39]:
preprocessing.normalize(input_data, norm='l2')

array([[ 0.75765788, -0.43082507,  0.49024922],
       [-0.12030718,  0.78199664, -0.61156148],
       [ 0.87690281,  0.08993875,  0.47217844],
       [ 0.55734935, -0.75585734, -0.34357152]])

## Label Encoding 

In [40]:
import numpy as np

In [47]:
input_labels = ['green', 'black', 'yellow', 'red', 'black', 'red', 'white'] 

In [48]:
encoder = preprocessing.LabelEncoder()

In [49]:
encoder.fit_transform(input_labels)

array([1, 0, 4, 2, 0, 2, 3], dtype=int64)

In [50]:
encoder.classes_

array(['black', 'green', 'red', 'white', 'yellow'],
      dtype='<U6')

In [51]:
encoder.transform(['black'])

array([0], dtype=int64)

In [None]:
input_labels

### Logistic Regression Classifier

In [1]:
import numpy as np 
from sklearn import linear_model  
import matplotlib.pyplot as plt

In [3]:
#X is two column data,
#y is class to which data belongs ( 4 possible classes )
X = np.array([[3.1, 7.2], [4, 6.7], [2.9, 8], [5.1, 4.5], [6, 5], [5.6, 5], [3.3, 0.4], [3.9, 0.9], [2.8, 1], [0.5, 3.4], [1, 4], [0.6, 4.9]]) 
y = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]) 

In [4]:
#Create an empty model
lg = linear_model.LogisticRegression()

In [5]:
#Training the estimator
lg.fit(X,y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [6]:
#Predicting using the same model
lg.predict([[3.4, 7.7]])

array([0])

In [9]:
#Finding the probability of data class
lg.predict_proba([[3.4, 7.7]])

array([[ 0.63341943,  0.20146046,  0.00137949,  0.16374062]])

In [2]:
from sklearn.datasets import load_iris

In [3]:
iris = load_iris()

In [4]:
iris.data

array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 4.7,  3.2,  1.3,  0.2],
       [ 4.6,  3.1,  1.5,  0.2],
       [ 5. ,  3.6,  1.4,  0.2],
       [ 5.4,  3.9,  1.7,  0.4],
       [ 4.6,  3.4,  1.4,  0.3],
       [ 5. ,  3.4,  1.5,  0.2],
       [ 4.4,  2.9,  1.4,  0.2],
       [ 4.9,  3.1,  1.5,  0.1],
       [ 5.4,  3.7,  1.5,  0.2],
       [ 4.8,  3.4,  1.6,  0.2],
       [ 4.8,  3. ,  1.4,  0.1],
       [ 4.3,  3. ,  1.1,  0.1],
       [ 5.8,  4. ,  1.2,  0.2],
       [ 5.7,  4.4,  1.5,  0.4],
       [ 5.4,  3.9,  1.3,  0.4],
       [ 5.1,  3.5,  1.4,  0.3],
       [ 5.7,  3.8,  1.7,  0.3],
       [ 5.1,  3.8,  1.5,  0.3],
       [ 5.4,  3.4,  1.7,  0.2],
       [ 5.1,  3.7,  1.5,  0.4],
       [ 4.6,  3.6,  1. ,  0.2],
       [ 5.1,  3.3,  1.7,  0.5],
       [ 4.8,  3.4,  1.9,  0.2],
       [ 5. ,  3. ,  1.6,  0.2],
       [ 5. ,  3.4,  1.6,  0.4],
       [ 5.2,  3.5,  1.5,  0.2],
       [ 5.2,  3.4,  1.4,  0.2],
       [ 4.7,  3.2,  1.6,  0.2],
       [ 4

In [15]:
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [76]:
iris.target.shape

(150,)

In [5]:
lg = linear_model.LogisticRegression()

In [6]:
lg.fit(iris.data[:145], iris.target[:145])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [8]:
lg.predict([[ 5.9,  3. ,  5.1,  1.8]])

array([2])

In [9]:
lg.predict_proba([[ 5.9,  3. ,  5.1,  1.8]])

array([[ 0.00170585,  0.29899447,  0.69929968]])

In [79]:
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [16]:
lg.decision_function

<bound method LinearClassifierMixin.decision_function of LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)>

### Naive Bayes classifier
* Based on Bayes Theorm
* Probabilty of outcome

In [26]:
import numpy as np 
import matplotlib.pyplot as plt 
import sklearn
from sklearn.naive_bayes import GaussianNB

In [18]:
data = np.loadtxt('data_multivar_nb.txt',delimiter=',')

In [19]:
data

array([[ 2.18,  0.57,  0.  ],
       [ 4.13,  5.12,  1.  ],
       [ 9.87,  1.95,  2.  ],
       ..., 
       [ 4.4 ,  6.75,  1.  ],
       [ 8.71,  3.38,  2.  ],
       [ 6.12, -1.8 ,  3.  ]])

In [20]:
classifier = GaussianNB()

In [92]:
classifier.fit(data[:,:-1],data[:,-1])

GaussianNB(priors=None)

In [95]:
y_pred = classifier.predict(data[:,:-1])

In [94]:
from sklearn.metrics import accuracy_score

In [96]:
accuracy_score(data[:,-1],y_pred)

0.99750000000000005

In [98]:
from sklearn import model_selection

In [102]:
np.round(classifier.predict_proba(data[:,:-1]),3)

array([[ 0.999,  0.   ,  0.   ,  0.001],
       [ 0.   ,  1.   ,  0.   ,  0.   ],
       [ 0.   ,  0.   ,  1.   ,  0.   ],
       ..., 
       [ 0.   ,  1.   ,  0.   ,  0.   ],
       [ 0.   ,  0.   ,  1.   ,  0.   ],
       [ 0.   ,  0.   ,  0.   ,  1.   ]])

In [105]:
np.round(classifier.predict_proba(data[0:1,:-1]),3)

array([[ 0.999,  0.   ,  0.   ,  0.001]])

In [106]:
np.round(classifier.predict(data[0:1,:-1]),3)

array([ 0.])

In [118]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(data[:,:-1], data[:,-1],test_size=0.5)

In [119]:
classifier.fit(X_train,y_train)

GaussianNB(priors=None)

In [120]:
y_pred = classifier.predict(X_test)

In [121]:
accuracy_score(y_test,y_pred)

0.995

In [10]:
from sklearn.datasets import fetch_20newsgroups

data = fetch_20newsgroups()

In [11]:
data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [126]:
data.target[0]

7

In [13]:
len(data.data)

11314

In [14]:
categories = ['rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian']
train = fetch_20newsgroups(subset='train', categories=categories)
test = fetch_20newsgroups(subset='test', categories=categories)

In [15]:
len(train.data)

5361

In [16]:
len(test.data)

3567

In [17]:
train.data[0]

'From: ching@bigwpi.WPI.EDU ("The Logistician")\nSubject: #77\'s?\nOrganization: Worcester Polytechnic Institute\nLines: 11\nDistribution: world\nNNTP-Posting-Host: bigwpi.wpi.edu\n\n\nI am in need of all of the players wearing #77 in the NHL.  I know now only\nof one, Ray Borque for the Bruins.  Any help would be greatly appreciated.\n\nThanx.\n\n-- \n------------------------THE LOGISTICIAN REIGNS SUPREME!!!----------------------\n|\t\t\t\t\t\t\t\t\t      |\n|   GO BLUE!!!   GO TIGERS!!!   GO PISTONS!!!   GO LIONS!!!   GO RED WINGS!!! |\n-------------------------------ching@wpi.wpi.edu-------------------------------\n'

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
tfidf_vec = TfidfVectorizer(stop_words='english')

In [23]:
data_tf = tfidf_vec.fit_transform(train.data)

In [24]:
data_tf.shape

(5361, 59850)

In [27]:
model = GaussianNB()

In [30]:
model.fit(data_tf.toarray(),train.target)

GaussianNB(priors=None)

In [34]:
d = tfidf_vec.transform(['good hockey save us'])
model.predict(d.toarray())

array([2], dtype=int64)

In [1]:
import numpy as np

In [2]:
import math

def sigmoid(x):
  return 1 / (1 + math.exp(-x))

In [7]:
sigmoid(28888)

1.0

In [1]:
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import classification_report 

In [2]:
# Define sample labels 
true_labels = [2, 0, 0, 2, 4, 4, 1, 0, 3, 3, 3] 
pred_labels = [2, 1, 0, 2, 4, 3, 1, 0, 1, 3, 3] 

In [3]:
confusion_mat = confusion_matrix(true_labels, pred_labels)

In [5]:
confusion_mat

array([[2, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 0, 2, 0, 0],
       [0, 1, 0, 2, 0],
       [0, 0, 0, 1, 1]], dtype=int64)

### Support Vector Machine
* Classification Technique
* Creates a hyperplane (N-dim line) separating classes
* 

In [6]:
from sklearn.multiclass import OneVsOneClassifier

In [7]:
import pandas as pd

In [10]:
data = pd.read_csv('adult.data.txt',header=None)

In [12]:
data.columns = ['age','workclass','fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race','sex' ,'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'salary']

In [39]:
from sklearn.preprocessing import LabelEncoder 
le=LabelEncoder()

cat_cols = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex','native-country', 'salary' ]

for col in cat_cols:
    data[col] = le.fit_transform(data[col])

In [18]:
from sklearn.svm import LinearSVC 
classifier = OneVsOneClassifier(LinearSVC(random_state=0)) 

In [20]:
data.iloc[:,:-1]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5
5,37,4,284582,12,14,2,4,5,4,0,0,0,40,39
6,49,4,160187,6,5,3,8,1,2,0,0,0,16,23
7,52,6,209642,11,9,2,4,0,4,1,0,0,45,39
8,31,4,45781,12,14,4,10,1,4,0,14084,0,50,39
9,42,4,159449,9,13,2,4,0,4,1,5178,0,40,39


In [21]:
data.iloc[:,-1]

0        0
1        0
2        0
3        0
4        0
5        0
6        0
7        1
8        1
9        1
10       1
11       1
12       0
13       0
14       1
15       0
16       0
17       0
18       0
19       1
20       1
21       0
22       0
23       0
24       0
25       1
26       0
27       1
28       0
29       0
        ..
32531    0
32532    1
32533    1
32534    0
32535    0
32536    1
32537    0
32538    1
32539    1
32540    0
32541    0
32542    0
32543    0
32544    0
32545    1
32546    0
32547    0
32548    0
32549    0
32550    0
32551    0
32552    0
32553    0
32554    1
32555    0
32556    0
32557    1
32558    0
32559    0
32560    1
Name: salary, Length: 32561, dtype: int64

In [22]:
classifier.fit(data.iloc[:,:-1], data.iloc[:,-1])

OneVsOneClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
     verbose=0),
          n_jobs=1)

In [26]:
db = { 'age':37, 'workclass': 'Private','fnlwgt' : 215646, 'education':'HS-grad', 'education-num':9, 'marital-status':'Never-married', 'occupation':'Handlers-cleaners', 'relationship':'Not-in-family', 'race':'White', 'sex':'Male' ,'capital-gain':0, 'capital-loss' :0, 'hours-per-week':40, 'native-country':'India'}

In [29]:
t = pd.DataFrame(db,index=[1])

In [34]:
from sklearn.preprocessing import LabelEncoder 
le=LabelEncoder()

In [32]:
pd.read_csv('adult.data.txt',header=None)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [35]:
t.columns = ['age','workclass','fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race','sex' ,'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']

In [40]:
cat_cols = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex','native-country' ]

for col in cat_cols:
    t[col] = le.transform(t[col])

In [41]:
t

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
1,37,0,0,0,9,0,0,0,0,0,0,0,0,0


In [42]:
classifier.predict(t)

array([0], dtype=int64)

### Regression 
* Process of estimating the relationship between input & output.
* output is continues in nature

In [43]:
x = np.arange(1,1000,2)

In [44]:
y = 5*x + 2

In [45]:
x

array([  1,   3,   5,   7,   9,  11,  13,  15,  17,  19,  21,  23,  25,
        27,  29,  31,  33,  35,  37,  39,  41,  43,  45,  47,  49,  51,
        53,  55,  57,  59,  61,  63,  65,  67,  69,  71,  73,  75,  77,
        79,  81,  83,  85,  87,  89,  91,  93,  95,  97,  99, 101, 103,
       105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127, 129,
       131, 133, 135, 137, 139, 141, 143, 145, 147, 149, 151, 153, 155,
       157, 159, 161, 163, 165, 167, 169, 171, 173, 175, 177, 179, 181,
       183, 185, 187, 189, 191, 193, 195, 197, 199, 201, 203, 205, 207,
       209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233,
       235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255, 257, 259,
       261, 263, 265, 267, 269, 271, 273, 275, 277, 279, 281, 283, 285,
       287, 289, 291, 293, 295, 297, 299, 301, 303, 305, 307, 309, 311,
       313, 315, 317, 319, 321, 323, 325, 327, 329, 331, 333, 335, 337,
       339, 341, 343, 345, 347, 349, 351, 353, 355, 357, 359, 36

In [46]:
y

array([   7,   17,   27,   37,   47,   57,   67,   77,   87,   97,  107,
        117,  127,  137,  147,  157,  167,  177,  187,  197,  207,  217,
        227,  237,  247,  257,  267,  277,  287,  297,  307,  317,  327,
        337,  347,  357,  367,  377,  387,  397,  407,  417,  427,  437,
        447,  457,  467,  477,  487,  497,  507,  517,  527,  537,  547,
        557,  567,  577,  587,  597,  607,  617,  627,  637,  647,  657,
        667,  677,  687,  697,  707,  717,  727,  737,  747,  757,  767,
        777,  787,  797,  807,  817,  827,  837,  847,  857,  867,  877,
        887,  897,  907,  917,  927,  937,  947,  957,  967,  977,  987,
        997, 1007, 1017, 1027, 1037, 1047, 1057, 1067, 1077, 1087, 1097,
       1107, 1117, 1127, 1137, 1147, 1157, 1167, 1177, 1187, 1197, 1207,
       1217, 1227, 1237, 1247, 1257, 1267, 1277, 1287, 1297, 1307, 1317,
       1327, 1337, 1347, 1357, 1367, 1377, 1387, 1397, 1407, 1417, 1427,
       1437, 1447, 1457, 1467, 1477, 1487, 1497, 15

In [47]:
from sklearn.linear_model import LinearRegression

In [48]:
lg = LinearRegression()

In [50]:
lg.fit(x[:,np.newaxis],y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [51]:
lg.coef_

array([ 5.])

In [52]:
lg.intercept_

1.9999999999986358

In [53]:
lg.predict([[1000]])

array([ 5002.])

In [54]:
y_new = 5*x*x + 3*x + 6

In [55]:
from sklearn.preprocessing import PolynomialFeatures

In [56]:
pf = PolynomialFeatures(2)

In [59]:
data = pf.fit_transform(x[:,np.newaxis])

In [60]:
lg.fit(data,y_new)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [61]:
lg.coef_

array([ 0.,  3.,  5.])

In [62]:
lg.intercept_

6.0000000016298145

In [1]:
import numpy as np