# Task 1: Create a module similar to `sklearn's` `datasets` module

Create a class called `my_sklearn`, which accepts the following parameters:

* path: str (path to the csv file)
* is_header: boolean (if the file has header)
* target_variable: str (target variable)
* feature_names: list (Not required if the file already has headers)
* random_state: int (Optional)

Define the following methods.

*  **`Representation` in the format:**
    
    `Path: path/to/the/file.csv`
    
    `feature variables: ['your', 'list', 'of', 'features', 'here']`

    `target variable: your_target_variable`
     

*  **`load_data()` with the following parameters:**

    * feature_subset: list of features to be selected (Optional)
    * train_size: float, fraction [0, 1] of data to be selected as training set (Optional)
    * CV_subset: str, one of "train", "test" or "all"

**Note:** 

* You can use `pandas`, `numpy` and `sklearn`'s `train_test_split` libraries.
* Include error handling for invalid inputs.

In [4]:
import numpy as np
import pandas as pd

try:
    # scikit-learn >= 0.18 provides train_test_split in model_selection.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older releases only ship the (since removed) cross_validation module.
    from sklearn.cross_validation import train_test_split

class my_sklearn(object):
    """Load a CSV file and expose it as feature/target NumPy arrays.

    Parameters
    ----------
    path : str
        Path to the CSV file.
    is_header : bool
        True when the first row of the file holds the column names.
    target_variable : str
        Name of the target column.
    feature_names : list of str, optional
        Names of the feature columns. Required when ``is_header`` is False;
        in that case the target is assumed to be the last column of the file.
    random_state : int, optional
        Seed forwarded to ``train_test_split``.

    Attributes
    ----------
    data, target : numpy.ndarray or None
        Populated by ``load_data``; None until it has been called.

    Raises
    ------
    AttributeError
        If the file has no header and ``feature_names`` is not a list.
    TypeError
        If ``feature_names`` is given but is not a list or tuple.
    ValueError
        If ``target_variable`` is not one of the file's columns.
    """

    def __init__(self, path, is_header, target_variable, feature_names=None, random_state=7):
        self.path = path
        self.target_variable = target_variable
        self.is_header = is_header
        self.random_state = random_state

        if not self.is_header:
            if not isinstance(feature_names, (list, tuple)):
                raise AttributeError("Feature Names must be entered if the file does not have header.")
            self.feature_names = list(feature_names)
            # header=None keeps the first row as data; header=0 would
            # silently discard it once explicit names are supplied.
            self.header = None
            self.names = self.feature_names + [self.target_variable]
        else:
            if feature_names is not None and not isinstance(feature_names, (list, tuple)):
                raise TypeError("Please provide a list for feature names")
            # Copy so later changes to the caller's list do not leak in.
            self.feature_names = list(feature_names) if feature_names else None
            self.header = 0
            self.names = None

        self._df = pd.read_csv(path, header=self.header, names=self.names)

        if self.target_variable not in self._df.columns:
            raise ValueError("Target variable {!r} not found in columns {}".format(
                self.target_variable, list(self._df.columns)))

        if not self.feature_names:
            # Default to every column except the target.
            self.feature_names = [column for column in self._df.columns
                                  if column != self.target_variable]

        self.data = None
        self.target = None
        self.feature_subset = None
        self.target_subset = None
        self.CV_subset = None
        self.train_size = None

    def __repr__(self):
        """Return the path, feature names and target name, one per line."""
        return "Path: {}\nfeature variables: {}\ntarget variable: {}".format(
            self.path, self.feature_names, self.target_variable)

    def load_data(self, feature_subset=None, CV_subset="train", train_size=None):
        """Fill ``self.data`` and ``self.target`` with the requested rows.

        Parameters
        ----------
        feature_subset : list of str, optional
            Features to keep. Every known feature is used when omitted or empty.
        CV_subset : {"train", "test", "all"}
            Which part of the data to expose.
        train_size : float, optional
            Fraction of rows, strictly between 0 and 1, assigned to the
            training split. Required for "train"/"test"; must be None for "all".

        Raises
        ------
        TypeError
            If ``feature_subset`` is not a list, or ``train_size`` is not a
            number when a split is requested.
        ValueError
            If a requested feature is unknown, ``train_size`` is out of range
            (or given together with "all"), or ``CV_subset`` is not recognised.
        """
        self.feature_subset = feature_subset
        self.CV_subset = CV_subset
        self.train_size = train_size

        if not self.feature_subset:
            self.feature_subset = self.feature_names
        elif not isinstance(self.feature_subset, list):
            raise TypeError("Please provide a list for feature subset")
        else:
            for feature in self.feature_subset:
                if feature not in self.feature_names:
                    raise ValueError("{} not in features".format(feature))

        if self.CV_subset == "all":
            if self.train_size is not None:
                raise ValueError("CV subset is 'all', train_size must be None")
            self.target = self._df[self.target_variable].values
            self.data = self._df[self.feature_subset].values

        elif self.CV_subset in ("train", "test"):
            # Check the type before comparing: ordering None against a
            # number raises an unrelated TypeError on Python 3.
            if not isinstance(self.train_size, (int, float)):
                raise TypeError("Please provide train size between 0 and 1")
            if not 0 < self.train_size < 1:
                raise ValueError("Please provide train size between 0 and 1")

            X_train, X_test, y_train, y_test = train_test_split(
                self._df[self.feature_subset],
                self._df[self.target_variable],
                train_size=self.train_size,
                random_state=self.random_state)

            if self.CV_subset == "train":
                self.target = y_train.values
                self.data = X_train.values
            else:
                self.target = y_test.values
                self.data = X_test.values
        else:
            raise ValueError("The value of cv_subset can only be 'train', 'test' or 'all'")



In [5]:
# Read the Titanic training CSV, which has a header row, with "Survived" as the target column.
titanic = my_sklearn("data/titanic_train.csv", is_header=True, target_variable="Survived")

In [6]:
titanic

path : data/titanic_train.csv
features varaibles: ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
target varaibles: Survived

In [7]:
# Expose the training split (train_size=0.8) through titanic.data and titanic.target.
titanic.load_data(CV_subset="train", train_size=0.8)

In [8]:
titanic.data

array([[206, 3, 'Strom, Miss. Telma Matilda', ..., 10.4625, 'G6', 'S'],
       [719, 3, 'McEvoy, Mr. Michael', ..., 15.5, nan, 'Q'],
       [836, 1, 'Compton, Miss. Sara Rebecca', ..., 83.1583, 'E49', 'C'],
       ..., 
       [538, 1, 'LeRoy, Miss. Bertha', ..., 106.425, nan, 'C'],
       [197, 3, 'Mernagh, Mr. Robert', ..., 7.75, nan, 'Q'],
       [176, 3, 'Klasen, Mr. Klas Albin', ..., 7.8542, nan, 'S']], dtype=object)

In [9]:
titanic.target

array([0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1,

In [10]:
titanic.target_variable

'Survived'

In [11]:
titanic.feature_names

['PassengerId',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']