In [9]:
import pandas as pd
import matplotlib.pyplot as plt
# Column labels handed to the reader through names=, in file order.
columns = [
    "mpg", "cylinders", "displacement", "horsepower", "weight",
    "acceleration", "year", "origin", "car name",
]
# Fields are separated by runs of whitespace. sep=r"\s+" is the documented
# equivalent of delim_whitespace=True, which is deprecated in pandas >= 2.2.
# NOTE(review): the printed frame suggests "horsepower" is read as strings;
# if the file marks missing values with a placeholder, consider na_values=.
cars = pd.read_table("./data/data_auto-mpg.data", sep=r"\s+", names=columns)
# Show the first and last five rows as a quick sanity check of the load.
print(cars.head(5))
print(cars.tail(5))

    mpg  cylinders  displacement horsepower  weight  acceleration  year  \
0  18.0          8         307.0      130.0  3504.0          12.0    70   
1  15.0          8         350.0      165.0  3693.0          11.5    70   
2  18.0          8         318.0      150.0  3436.0          11.0    70   
3  16.0          8         304.0      150.0  3433.0          12.0    70   
4  17.0          8         302.0      140.0  3449.0          10.5    70   

   origin                   car name  
0       1  chevrolet chevelle malibu  
1       1          buick skylark 320  
2       1         plymouth satellite  
3       1              amc rebel sst  
4       1                ford torino  
      mpg  cylinders  displacement horsepower  weight  acceleration  year  \
393  27.0          4         140.0      86.00  2790.0          15.6    82   
394  44.0          4          97.0      52.00  2130.0          24.6    82   
395  32.0          4         135.0      84.00  2295.0          11.6    82   
396  28

In [10]:
# Turn the numeric "cylinders" feature into one indicator (dummy) column per
# distinct value, named cyl_<value>, and append those columns to the frame.
dummy_cylinders = pd.get_dummies(cars["cylinders"], prefix="cyl")
cars = pd.concat([cars, dummy_cylinders], axis=1)
print(cars.head())

# Same encoding for "year" (columns named year_<value>). Once both sets of
# indicator columns are in place, the two source columns are dropped.
dummy_years = pd.get_dummies(cars["year"], prefix="year")
cars = pd.concat([cars, dummy_years], axis=1).drop(["year", "cylinders"], axis=1)
print(cars.head())

    mpg  cylinders  displacement horsepower  weight  acceleration  year  \
0  18.0          8         307.0      130.0  3504.0          12.0    70   
1  15.0          8         350.0      165.0  3693.0          11.5    70   
2  18.0          8         318.0      150.0  3436.0          11.0    70   
3  16.0          8         304.0      150.0  3433.0          12.0    70   
4  17.0          8         302.0      140.0  3449.0          10.5    70   

   origin                   car name  cyl_3  cyl_4  cyl_5  cyl_6  cyl_8  
0       1  chevrolet chevelle malibu      0      0      0      0      1  
1       1          buick skylark 320      0      0      0      0      1  
2       1         plymouth satellite      0      0      0      0      1  
3       1              amc rebel sst      0      0      0      0      1  
4       1                ford torino      0      0      0      0      1  
    mpg  displacement horsepower  weight  acceleration  origin  \
0  18.0         307.0      130.0  3504.

In [11]:
import numpy as np
# Fixed seed so the shuffle -- and therefore the train/test split and every
# number derived from it -- is the same on each Restart & Run All.
SPLIT_SEED = 42

# Permute row *positions* (0..n-1), not index labels: the result is consumed
# by .iloc below, which is positional. Permuting cars.index only works while
# the index happens to be the default 0..n-1 range.
shuffled_rows = np.random.RandomState(SPLIT_SEED).permutation(len(cars))
shuffled_cars = cars.iloc[shuffled_rows]

# The first 70% of the shuffled rows form the training set, the rest the test set.
highest_train_row = int(cars.shape[0] * .70)
train = shuffled_cars.iloc[0:highest_train_row]
test = shuffled_cars.iloc[highest_train_row:]

In [12]:
from sklearn.linear_model import LogisticRegression

# Distinct origin codes in ascending order; one binary classifier is fitted per code.
unique_origins = np.sort(cars["origin"].unique())

# Predictors: every training column whose name starts with "cyl" or "year".
features = [name for name in train.columns if name.startswith(("cyl", "year"))]

# The feature matrix is the same for every classifier, so build it once.
X_train = train[features]

models = {}
for origin in unique_origins:
    # One-vs-rest target: True where the row's origin equals this code.
    y_train = train["origin"] == origin

    model = LogisticRegression()
    model.fit(X_train, y_train)
    models[origin] = model

In [13]:
# Testing features, shared by every per-origin classifier.
X_test = test[features]

# Start from an empty frame with one column per origin code and show it.
testing_probs = pd.DataFrame(columns=unique_origins)
print(testing_probs)

for origin in unique_origins:
    # Column 1 of predict_proba is kept as the probability of the
    # observation being in this origin.
    class_probs = models[origin].predict_proba(X_test)
    testing_probs[origin] = class_probs[:, 1]
print(testing_probs)

Empty DataFrame
Columns: [1, 2, 3]
Index: []
            1         2         3
0    0.939137  0.032251  0.058107
1    0.544206  0.258212  0.189678
2    0.967978  0.029988  0.022927
3    0.497805  0.417296  0.102453
4    0.497805  0.417296  0.102453
5    0.352692  0.280506  0.349260
6    0.967978  0.029988  0.022927
7    0.965727  0.022087  0.035465
8    0.947058  0.053702  0.019619
9    0.984075  0.015510  0.019259
10   0.951305  0.038694  0.027580
11   0.352692  0.280506  0.349260
12   0.226923  0.307289  0.469201
13   0.833010  0.062675  0.106111
14   0.833010  0.062675  0.106111
15   0.352692  0.280506  0.349260
16   0.226923  0.307289  0.469201
17   0.876154  0.063490  0.071784
18   0.368719  0.405842  0.218576
19   0.947058  0.053702  0.019619
20   0.278496  0.411223  0.310540
21   0.951305  0.038694  0.027580
22   0.980880  0.031395  0.009485
23   0.640798  0.134809  0.238217
24   0.965907  0.031054  0.024025
25   0.640798  0.134809  0.238217
26   0.876154  0.063490  0.071784
27 