In [106]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.preprocessing import LabelEncoder
from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_val_score

In [253]:

#### obtain cols of XX type
def obtain_x(train_df,xtype):
    dtype_df = train_df.dtypes.reset_index()
    dtype_df.columns = ['col','type']
    return dtype_df[dtype_df.type==xtype].col.values

def load_data():
    raw_train_df=pd.read_excel("train.xlsx")
    raw_test_df=pd.read_excel("testA.xlsx")
    return raw_train_df,raw_test_df
def drop_missing_columns(train_df,num):
    col_missing_df = train_df.isnull().sum(axis=0).reset_index()
    col_missing_df.columns = ['col','missing_count']
    col_missing_df = col_missing_df.sort_values(by='missing_count')
    all_nan_columns = col_missing_df[col_missing_df.missing_count>num].col.values.tolist()
    train_df.drop(all_nan_columns,axis=1,inplace=True)
    return train_df
def split_columns(train_df):
    textcolumns=[]
    columns=train_df.columns.tolist()
    for item in columns:
        if item[0:1] not in ['0','1','2','3','4','5','6','7','8','9']:
            textcolumns.append(item)
    textcolumns.remove("Y")
    #textcolumns.remove("ID")
    numcolumns=[col for col in columns if col not in textcolumns]
    return numcolumns,textcolumns
def Remove_duplicate_columns(train_df):
    return train_df.T.drop_duplicates().T
def drop_columns_all_equal(df):
    float_uniq_col = []
    for col in df.columns.values:
        uniq = df[col].unique()
        if len(uniq) == 1:
            float_uniq_col.append(col)
    df.drop(float_uniq_col,axis=1,inplace=True)
    return df
def date_cols(train_df):
    float_date_col = []
    for col in train_df.columns.tolist():
        if train_df[col].min() > 1e13:
            float_date_col.append(col)
    return float_date_col  
def cal_corrcoef(float_df,y_train):
    float_col=float_df.columns.tolist()
    corr_values = []
    for col in float_col:
        corr_values.append(abs(np.corrcoef(float_df[col].values,y_train)[0,1]))
    corr_df = pd.DataFrame({'col':float_col,'corr_value':corr_values})
    corr_df = corr_df.sort_values(by='corr_value',ascending=False)
    return corr_df
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    
    def __init__(self, encoding='onehot', categories='auto', dtype=np.float64,
                 handle_unknown='error'):
        self.encoding = encoding
        self.categories = categories
        self.dtype = dtype
        self.handle_unknown = handle_unknown
    def fit(self, X, y=None):
        if self.encoding not in ['onehot', 'onehot-dense', 'ordinal']:
            template = ("encoding should be either 'onehot', 'onehot-dense' "
                        "or 'ordinal', got %s")
            raise ValueError(template % self.handle_unknown)
        if self.handle_unknown not in ['error', 'ignore']:
            template = ("handle_unknown should be either 'error' or "
                        "'ignore', got %s")
            raise ValueError(template % self.handle_unknown)
        if self.encoding == 'ordinal' and self.handle_unknown == 'ignore':
            raise ValueError("handle_unknown='ignore' is not supported for"
                             " encoding='ordinal'")
        X = check_array(X, dtype=np.object, accept_sparse='csc', copy=True)
        n_samples, n_features = X.shape
        self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]
        for i in range(n_features):
            le = self._label_encoders_[i]
            Xi = X[:, i]
            if self.categories == 'auto':
                le.fit(Xi)
            else:
                valid_mask = np.in1d(Xi, self.categories[i])
                if not np.all(valid_mask):
                    if self.handle_unknown == 'error':
                        diff = np.unique(Xi[~valid_mask])
                        msg = ("Found unknown categories {0} in column {1}"
                               " during fit".format(diff, i))
                        raise ValueError(msg)
                le.classes_ = np.array(np.sort(self.categories[i]))

        self.categories_ = [le.classes_ for le in self._label_encoders_]
        return self
    def transform(self, X):
        X = check_array(X, accept_sparse='csc', dtype=np.object, copy=True)
        n_samples, n_features = X.shape
        X_int = np.zeros_like(X, dtype=np.int)
        X_mask = np.ones_like(X, dtype=np.bool)

        for i in range(n_features):
            valid_mask = np.in1d(X[:, i], self.categories_[i])

            if not np.all(valid_mask):
                if self.handle_unknown == 'error':
                    diff = np.unique(X[~valid_mask, i])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during transform".format(diff, i))
                    raise ValueError(msg)
                else:
                    X_mask[:, i] = valid_mask
                    X[:, i][~valid_mask] = self.categories_[i][0]
            X_int[:, i] = self._label_encoders_[i].transform(X[:, i])
        if self.encoding == 'ordinal':
            return X_int.astype(self.dtype, copy=False)
        mask = X_mask.ravel()
        n_values = [cats.shape[0] for cats in self.categories_]
        n_values = np.array([0] + n_values)
        indices = np.cumsum(n_values)

        column_indices = (X_int + indices[:-1]).ravel()[mask]
        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                n_features)[mask]
        data = np.ones(n_samples * n_features)[mask]

        out = sparse.csc_matrix((data, (row_indices, column_indices)),
                                shape=(n_samples, indices[-1]),
                                dtype=self.dtype).tocsr()
        if self.encoding == 'onehot-dense':
            return out.toarray()
        else:
            return out
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [254]:
#read the data
raw_train_df,raw_test_df=load_data()

raw_train_df.drop("ID",axis=1,inplace=True)
raw_test_df.drop("ID",axis=1,inplace=True)

In [255]:
def process_train_data(train_df):
    float_col=obtain_x(train_df,'float64')
    train_df=train_df[float_col].copy()
    train_df = drop_missing_columns(train_df,50)
    numcolumns,textcolumns=split_columns(train_df)
    num_df=Remove_duplicate_columns(train_df[numcolumns])
    num_df=drop_columns_all_equal(num_df)
    datecol=date_cols(num_df)
    num_df.drop(datecol,axis=1,inplace=True)
    y_train = num_df.Y.values
    num_df.drop("Y",axis=1,inplace=True)
    corr_df = cal_corrcoef(num_df,train_labels)
    corr02 = corr_df[corr_df.corr_value>=0.25]
    corr02_col = corr02['col'].values.tolist()
    num_df = num_df[corr02_col]
    return num_df

In [256]:
train_df=process_train_data(raw_train_df)
num_attribs=train_df.columns.tolist()
numcolumns,textcolumns=split_columns(raw_train_df)
text_attribs=textcolumns
num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('imputer', Imputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])
cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(text_attribs)),
        ('cat_encoder', CategoricalEncoder(encoding="onehot-dense")),
    ])
full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])



In [265]:
train_prepared=full_pipeline.fit_transform(raw_train_df)
train_labels=raw_train_df.Y.values

In [266]:
lin_reg = LinearRegression()
lin_reg.fit(train_prepared, train_labels)
lin_scores = cross_val_score(lin_reg, train_prepared, train_labels,scoring="neg_mean_squared_error", cv=10)
np.mean(lin_scores)

-0.03448147153189203

In [233]:
test_prepared=full_pipeline.transform(raw_test_df)

ValueError: Found unknown categories [206] in column 5 during transform