## Notes on Pandas & Numpy

### Data creation and operations

jupyter notebook --notebook-dir=C:\MyFolder\Local_Docs\Trainings\MachineLearning

In [10]:
import pandas  as pd
import numpy as np

In [None]:
# Import data
train_data = pd.read_csv('../input/train.csv', index_col='Id')
test_data = pd.read_csv('../input/test.csv', index_col='Id')
print('_'*40)

In [None]:
# Create training and validation splits
df_train = red_wine.sample(frac=0.7, random_state=0)
df_valid = red_wine.drop(df_train.index) # Drop all indexes used in df_train

In [None]:
from sklearn.model_selection import train_test_split

# Separate the dataset into feature variables (X) and response variable (y)
X = wine.drop('quality', axis=1)
y = wine['quality']

# ... OR copy the frame and pop the target column out of the copy
# (this alternative overwrites the X and y built above and uses "price" as the target)
X = wine.copy()
y = X.pop("price")

# Train and test splitting of data: 20% of the rows are held out for testing,
# random_state fixes the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Add/remove a column
df['new'] = np.random.random(5)
df.drop('new', axis=1, inplace=True)

# Add/remove rows
df.loc[5,:] = ['Jack', 3, 3, 4, 5, 1] # Sixth row
df.drop(5, axis=0, inplace=True)

# Insert a new column in a specified position
# (np.random.random(5) yields 5 values, so this assumes df has 5 rows — adjust to len(df) otherwise)
df.insert(0, 'new', np.random.random(5))  # New column at position 0, i.e. it becomes the first column

In [None]:
# Scale data to [0, 1]
max_ = df_train.max(axis=0)
min_ = df_train.min(axis=0)
df_train = (df_train - min_) / (max_ - min_)
df_valid = (df_valid - min_) / (max_ - min_)

In [None]:
# List
my_list = []

# Create a list from existing data
col = list(train_data.columns)

# Dictionary
my_dict = {}

In [30]:
# Create a Series with a timestamp index (launched)
launched = pd.Series(ks.index, index=ks.launched, name="count_7_days").sort_index()

# Example: build a Series from a dict; the index selects (and orders) the dict keys
d = {'a': 1, 'b': 2, 'c': 3}
ser = pd.Series(data=d, index=['a', 'b', 'c'])

# Expected content (kept as comments so the cell stays valid Python):
# >>> ser
# a    1
# b    2
# c    3
# dtype: int64
# Example end

# Creates a rolling window that contains all the data in the previous 7 days and counts the projects
count_7_days = launched.rolling('7d').count() - 1
# Adjust the index so we can join it with the other training data
count_7_days.index = launched.values
count_7_days = count_7_days.reindex(ks.index)
# Join
baseline_data.join(count_7_days)

def time_since_last_project(series):
    """Return, for each entry, the hours elapsed since the previous entry.

    `series` must hold datetime values (the `.dt` accessor is used on its
    row-to-row difference). The first entry has no predecessor, so its
    result is NaN.
    """
    seconds_per_hour = 3600
    gap_to_previous = series.diff()  # Timedelta between each row and the one before it
    return gap_to_previous.dt.total_seconds() / seconds_per_hour

df = ks[['category', 'launched']].sort_values('launched')
timedeltas = df.groupby('category').transform(time_since_last_project)

# Final time since last project
timedeltas = timedeltas.fillna(timedeltas.median()).reindex(baseline_data.index)

# Create series
dataser = pd.Series([30, 35, 40], index=['2015 Sales', '2016 Sales', '2017 Sales'], name='Product A')
dataser

2015 Sales    30
2016 Sales    35
2017 Sales    40
Name: Product A, dtype: int64

In [31]:
# Create data-frame
datafr = pd.DataFrame({   'Bob': ['I liked it.', 'It was awful.'], 
                          'Sue': ['Pretty good.', 'Bland.']},
                           index=['Product A', 'Product B'])

datafr

Unnamed: 0,Bob,Sue
Product A,I liked it.,Pretty good.
Product B,It was awful.,Bland.


### Indexing

In [None]:
# Get the positions of the rows whose Date string is longer than 24 characters
indices = np.where([earthquakes.Date.str.len() > 24])[1] # The mask is wrapped in a list (shape 1 x N), so element [1] of the result holds the matching positions

In [24]:
# Indexing - Equivalent forms

#datafr['Bob']
#datafr.iloc[:,0]
#datafr.loc[:, 'Bob']
datafr.Bob

Product A      I liked it.
Product B    It was awful.
Name: Bob, dtype: object

In [None]:
# Indexing
reviews.loc[(reviews.country == 'Italy') & (reviews.points >= 90)]

In [None]:
# loc is primarily label based indexing. Integers may be used but they are interpreted as a label.
# iloc is primarily integer based indexing
# To select a subset of rows and columns from our DataFrame, we can use the iloc method. For example, we can select month, day and year (columns 2, 3 and 4 if we start counting at 1), like this:

# iloc[row slicing, column slicing]
surveys_df.iloc[0:3, 1:4]

In [None]:
# Make "title" column the index of the data frame
import pandas as pd
reviews.set_index("title")

In [None]:
# Conditioning
top_oceania_wines = reviews.loc[reviews.country.isin(['Australia', 'New Zealand']) & (reviews.points >= 95)]

In [None]:
# Pandas methods
reviews.points.describe()
reviews.points.mean()
reviews.taster_name.unique()
reviews.taster_name.value_counts()

# Index max of ratio points/price
bargain_wine = reviews.title[(reviews.points/reviews.price).idxmax()]

In [None]:
# Table Class-Mean(Survived)
train_data[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean().sort_values(by='Survived', ascending=False)


  |Pclass|Survived|
-------------------
0    1    0.629630
1    2    0.472826
2    3    0.242363
 

In [None]:
# Columns names
X_data.columns

# Selecting a column
y = X_data.Price

#Selecting more columns
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
X = X_data[melbourne_features]

In [None]:
# Copy dataframe avoiding original data changes
X1 = X.copy()

In [None]:
# Percentage missing data
total = df_train.isnull().sum().sort_values(ascending=False)
percent = (df_train.isnull().sum()/df_train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
print(missing_data)

### Models

In [None]:
# Random forest
from sklearn.ensemble import RandomForestRegressor

forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(train_X, train_y)
melb_preds = forest_model.predict(val_X)

# Models: five random-forest variants that differ in number of trees, split criterion,
# minimum samples per split and maximum depth; all share random_state=0
model_1 = RandomForestRegressor(n_estimators=50, random_state=0)
model_2 = RandomForestRegressor(n_estimators=100, random_state=0)
# NOTE(review): newer scikit-learn releases renamed criterion 'mae' to 'absolute_error'
# and later dropped the old spelling — confirm against the installed version
model_3 = RandomForestRegressor(n_estimators=100, criterion='mae', random_state=0)
model_4 = RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=0)
model_5 = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=0)

# Collected in a list so they can be evaluated in a loop
models = [model_1, model_2, model_3, model_4, model_5]

In [None]:
# Decision tree
from sklearn.tree import DecisionTreeRegressor

my_model = DecisionTreeRegressor(max_leaf_nodes=max_n_leaf, random_state=1)

In [None]:
# Fit model
my_model.fit(X, y)

# Predict
my_model.predict(X_test)

In [5]:
# Mean absolute error
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(val_y, preds_val)

In [None]:
# XGBoost 
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

my_model = XGBRegressor(n_estimators=500, learning_rate=0.05, n_jobs=4) # n_estimators [100-1000], n_jobs = n of PC cores

my_model.fit(X_train, y_train, 
             early_stopping_rounds=5,             # stops after 5 deteriorating rounds
             eval_set=[(X_valid, y_valid)],       # validation score data
             verbose=False)

predictions = my_model.predict(X_valid)

print("Mean Absolute Error: " + str(mean_absolute_error(predictions, y_valid)))

### Missing Values

In [None]:
# Mark all missing values: every '?' placeholder becomes NaN
# (np.nan, since a bare `nan` name is never imported)
dataset.replace('?', np.nan, inplace=True)

# Remove all the rows that contain a missing value
data_cleaned = nfl_data.dropna()

# Remove all columns with at least one missing value
data_cleaned = nfl_data.dropna(axis=1)

# Replace all NA's with 0
subset_nfl_data.fillna(0)

# Replace each NA with the value that comes directly after it in the same column, then replace all the remaining NA's with 0
subset_nfl_data.fillna(method='bfill', axis=0).fillna(0)

# Fill missing values with the value of the previous day
def fill_missing(df, one_day=60 * 24):
    """Fill NaN cells, in place, with the value found `one_day` rows earlier in the same column.

    Parameters
    ----------
    df : 2-D numpy.ndarray of floats
        Indexed as ``df[row, col]``, so despite its name this must be an array
        (e.g. ``frame.values``), not a pandas DataFrame.
    one_day : int, optional
        Number of rows that make up one day. Defaults to ``60 * 24``
        (one row per minute).

    Returns
    -------
    None
        The array is modified in place.

    Notes
    -----
    Rows are visited top to bottom, so a cell filled earlier can itself be the
    source of a later fill. For the first `one_day` rows, ``row - one_day`` is
    negative and therefore indexes from the END of the array: those cells are
    filled from the last day of the data, not from a previous day.
    """
    n_rows, n_cols = df.shape[0], df.shape[1]
    for row in range(n_rows):
        for col in range(n_cols):
            # np.isnan: a bare `isnan` name is never imported in this notebook
            if np.isnan(df[row, col]):
                df[row, col] = df[row - one_day, col]


### 1 - Numerical data

In [None]:
# Remove rows with missing target ('SalePrice'), separate target from predictors
X1.dropna(axis=0, subset=['SalePrice'], inplace=True)
# Or
# Remove live projects rows
ks = ks.query('state != "live"')

# Copy target
y = X1.SalePrice

# Drop 'SalePrice' column
X1.drop(['SalePrice'], axis=1, inplace=True)

In [None]:
# Remove non-numerical (categorical) predictors
X2 = X1.select_dtypes(exclude=['object'])

In [None]:
# Print columns with missing values > 0
missing_val_count_by_column = (X_train.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

#### Drop columns

In [None]:
# Get missing values columns name
cols_with_missing = [col for col in X_train.columns if X_train[col].isnull().any()]

# Drop missing values columns
reduced_X_train = X_train.drop(cols_with_missing, axis=1)

#### Impute Values

In [None]:
from sklearn.impute import SimpleImputer

# SimpleImputer() with no arguments replaces missing values with the column mean
my_imputer = SimpleImputer()

imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))  # Learns the column means from X_train and fills its gaps. New data frame has no column names
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))      # Only transforms: fills gaps with the means already learned from X_train (no re-fitting)

# Include column names
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns


In [None]:
# Fill missing 'Fare' values with the column median (median() skips NaN by default).
# The result is assigned back instead of calling fillna(inplace=True) on a column
# selection, which can act on a temporary copy and leave test_df unchanged.
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())

### 2 - Categorical Data

In [None]:
# Get list of categorical variables columns
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)

# .. alternatively
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

# .. alternatively + low cardinality condition
categorical_cols = [cname for cname in X_train_full.columns if
                    X_train_full[cname].nunique() < 10 and 
                    X_train_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if
                  X_train_full[cname].dtype in ['int64', 'float64']]

#### Label Removal

In [None]:
# Remove non-numerical predictors
X1 = X_train.select_dtypes(exclude=['object'])

#### Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Work on copies so the original frames keep their categorical values
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# Apply label encoder to each column with categorical data
label_encoder = LabelEncoder()

for col in object_cols:
    # LabelEncoder handles one 1-D column at a time: learn the mapping on the
    # training column, then reuse that same mapping on the validation column.
    # transform() raises if the validation column holds a label unseen in training.
    label_X_train[col] = label_encoder.fit_transform(X_train[col])
    label_X_valid[col] = label_encoder.transform(X_valid[col])
    

# ... other example
cat_features = ['category', 'currency', 'country']
encoder = LabelEncoder()

# Apply the label encoder to each column
encoded = ks[cat_features].apply(encoder.fit_transform)

# Since ks and encoded have the same index and I can easily join them
data = ks[['goal', 'hour', 'day', 'month', 'year', 'outcome']].join(encoded)  

# ... or with column composition
interactions = ks['category'] + "_" + ks['country'] # Poetry_GB

label_enc = LabelEncoder()
data_interaction = baseline_data.assign(category_country=label_enc.fit_transform(interactions))

In [None]:
# ... OR

# Label encoding for categoricals
for colname in X.select_dtypes("object"):
    X[colname], _ = X[colname].factorize()
    
    
# Factorize - Example
# codes: integer label per element (numbered by order of first appearance)
# uniques: the distinct values, in that same order
# An ndarray is passed because recent pandas versions deprecate plain-list input here
codes, uniques = pd.factorize(np.array(['b', 'b', 'a', 'c', 'b'], dtype=object))

# Expected content (kept as comments so the cell stays valid Python):
# >>> codes
# array([0, 0, 1, 2, 0]...)
#
# >>> uniques
# array(['b', 'a', 'c'], dtype=object)


#### Count Encoding

In [None]:
import category_encoders as ce
cat_features = ['category', 'currency', 'country']

# Create the count encoder
count_enc = ce.CountEncoder(cols=cat_features)

# Learn encoding from the training set
count_enc.fit(train[cat_features])

# Apply encoding to the train and validation sets
train_encoded = train.join(count_enc.transform(train[cat_features]).add_suffix('_count'))
valid_encoded = valid.join(count_enc.transform(valid[cat_features]).add_suffix('_count'))

#### Target Encoding

In [None]:
import category_encoders as ce

# Target encoding replaces a categorical value with the average value of the target for that value of the feature.
target_enc = ce.TargetEncoder(cols=cat_features)
target_enc.fit(train[cat_features], train['outcome'])     # -> only train data used to avoid leakage

# Transform the features, rename the columns with _target suffix, and join to dataframe
train_TE = train.join(target_enc.transform(train[cat_features]).add_suffix('_target'))
valid_TE = valid.join(target_enc.transform(valid[cat_features]).add_suffix('_target'))

#### CatBoost Encoding

In [None]:
import category_encoders as ce

# Similar to target encoding in that it's based on the target probability for a given value.
# However with CatBoost, for each row, the target probability is calculated only from the rows before it.
target_enc = ce.CatBoostEncoder(cols=cat_features)
target_enc.fit(train[cat_features], train['outcome'])  # -> only train data used to avoid leakage

# Transform the features, rename columns with _cb suffix, and join to dataframe
train_CBE = train.join(target_enc.transform(train[cat_features]).add_suffix('_cb'))
valid_CBE = valid.join(target_enc.transform(valid[cat_features]).add_suffix('_cb'))

#### Label Mapping

In [None]:
# Like "label Encoding". This is done by specifying explicitly the encoding i.e. {'female': 1, 'male': 0} on single columns
dataset['Sex'] = dataset['Sex'].map( {'female': 1, 'male': 0} ).astype(int)

#### Label One-hot Encoding

In [None]:
# One-hot Encoding
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Concatenate one-hot encoded columns with numerical ones
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

In [None]:
# .. or one-hot encoding with Pandas

# Each frame is encoded independently, so the resulting column sets can differ
X_train = pd.get_dummies(X_train)
X_valid = pd.get_dummies(X_valid)
X_test = pd.get_dummies(X_test)
X_train, X_valid = X_train.align(X_valid, join='left', axis=1) # join='left' keeps X_train's columns; X_valid is reindexed to match them
X_train, X_test = X_train.align(X_test, join='left', axis=1)   # Same for X_test: it ends up with exactly X_train's columns

In [None]:
# Remove bad categorical columns

# Collect categorical columns
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

# Columns that can be safely label encoded
good_label_cols = [col for col in object_cols if set(X_train[col]) == set(X_valid[col])]
        
# Problematic columns that will be dropped from the dataset
bad_label_cols = list(set(object_cols)-set(good_label_cols))

In [7]:
# Low cardinality columns
low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]

In [None]:
# Get number of unique entries in each column with categorical data
object_nunique = list(map(lambda col: X_train[col].nunique(), object_cols))
d = dict(zip(object_cols, object_nunique))

# Print number of unique entries by column, in ascending order
sorted(d.items(), key=lambda x: x[1])

In [None]:
# Get number of unique entries in each column with categorical data
object_nunique = list(map(lambda col: X_train[col].nunique(), object_cols))
d = dict(zip(object_cols, object_nunique))

# Print number of unique entries by column, in ascending order
sorted(d.items(), key=lambda x: x[1])

### Pipeline

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='mean') # Your code here

# Preprocessing for categorical data: fill missing values first, then one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')), # With strategy='constant' and no fill_value, string columns are filled with "missing_value"
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define model
model = RandomForestRegressor(n_estimators=100, random_state=0) # Your code here

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),  # "make_pipeline" generates names for steps automatically
                              ('model', model)
                             ])

# Preprocessing of training data, fit model 
my_pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_valid)

### Features Creation

In [None]:
# Cat features: build one label-encoded interaction column for every pair of categorical features
import itertools

from sklearn.preprocessing import LabelEncoder

cat_features = ['ip', 'app', 'device', 'os', 'channel']
interactions = pd.DataFrame(index=clicks.index)

for col1, col2 in itertools.combinations(cat_features, 2):
    new_col_name = '_'.join([col1, col2])

    # Convert to strings and combine
    new_values = clicks[col1].map(str) + "_" + clicks[col2].map(str)

    # A fresh encoder per pair: each interaction column gets its own integer mapping.
    # LabelEncoder is used directly; the `preprocessing` module name is never imported here.
    encoder = LabelEncoder()
    interactions[new_col_name] = encoder.fit_transform(new_values)

# interactions shares clicks' index, so a plain join lines the rows up
clicks = clicks.join(interactions)

### Univariate Features Selection

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

feature_cols = baseline_data.columns.drop('outcome')
train, valid, _ = get_data_splits(baseline_data)

# Keep 5 features
selector = SelectKBest(f_classif, k=5)

X_new = selector.fit_transform(train[feature_cols], train['outcome'])

# Get back the features we've kept, zero out all other features
selected_features = pd.DataFrame(selector.inverse_transform(X_new), index=train.index, columns=feature_cols)

# Dropped columns have values of all 0s, so var is 0, drop them
selected_columns = selected_features.columns[selected_features.var() != 0]

### L1 Regularization (Lasso)

In [None]:
# Univariate methods consider only one feature at a time when making a selection decision. 
# Instead, we can make our selection using all of the features by including them in a linear model 
# with L1 regularization.

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

train, valid, _ = get_data_splits(baseline_data)

X, y = train[train.columns.drop("outcome")], train['outcome']

# Set the regularization parameter C=1
logistic = LogisticRegression(C=1, penalty="l1", solver='liblinear', random_state=7).fit(X, y)
model = SelectFromModel(logistic, prefit=True)

X_new = model.transform(X)

# Get back the kept features as a DataFrame with dropped columns as all 0s
selected_features = pd.DataFrame(model.inverse_transform(X_new), index=X.index, columns=X.columns)

# Dropped columns have values of all 0s, keep other columns 
selected_columns = selected_features.columns[selected_features.var() != 0]

# Feature selection with L1 regularization is more powerful than the univariate tests, but it can also be 
# very slow when you have a lot of data and a lot of features. Univariate tests will be much faster.

### Cross-validation

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

my_pipeline = Pipeline(steps=[('preprocessor', SimpleImputer()),
                              ('model', RandomForestRegressor(n_estimators=50,random_state=0))  ])

In [None]:
from sklearn.model_selection import cross_val_score

# Multiply by -1 since sklearn calculates *negative* MAE
scores = -1 * cross_val_score(my_pipeline, X, y, cv=5, scoring='neg_mean_absolute_error') # cv -> number of folds

### LightGBM

In [None]:
import lightgbm as lgb
from sklearn import metrics

# Split data
valid_fraction = 0.1
clicks_srt = clicks.sort_values('click_time')
valid_rows = int(len(clicks_srt) * valid_fraction)
train = clicks_srt[:-valid_rows * 2]

# valid size == test size, last two sections of the data
valid = clicks_srt[-valid_rows * 2:-valid_rows]
test = clicks_srt[-valid_rows:]

# Train data
dtrain = lgb.Dataset(train[feature_cols], label=train['is_attributed'])
dvalid = lgb.Dataset(valid[feature_cols], label=valid['is_attributed'])
dtest = lgb.Dataset(test[feature_cols], label=test['is_attributed'])

param = {'num_leaves': 64, 'objective': 'binary'}
param['metric'] = 'auc'
num_round = 1000
bst = lgb.train(param, dtrain, num_round, valid_sets=[dvalid], early_stopping_rounds=10)

# Predict on test set
ypred = bst.predict(test[feature_cols])
score = metrics.roc_auc_score(test['is_attributed'], ypred)

### Time Calculation

In [None]:
import time

# Measure the wall-clock time (in seconds) of a single pandas fillna call
t0 = time.time()
ser.fillna(value=ser.mode()[0])  # Fill missing values with the most frequent value
print('Pandas Time Elapsed:', time.time()-t0)

### Transformations

In [None]:
# Log transformation
df_train['SalePrice'] = np.log(df_train['SalePrice'])

### Dummy variables

In [None]:
s = pd.Series(list('abca'))
pd.get_dummies(s)

#    a  b  c
# 0  1  0  0
# 1  0  1  0
# 2  0  0  1
# 3  1  0  0

### Miscellanea

In [32]:
import pandas as pd
import numpy as np

df = pd.DataFrame(np.arange(12).reshape(3,4),columns=['A', 'B', 'C', 'D'])
print(df)

# Axis 
print('---------')
print(df.mean(axis=1)) # 0: along rows (picking up a column at time), 1: along columns (picking up a row at time)
print('---------')

df = df.drop('A', axis=1)
print(df)

   A  B   C   D
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
---------
0    1.5
1    5.5
2    9.5
dtype: float64
---------
   B   C   D
0  1   2   3
1  5   6   7
2  9  10  11


In [None]:
# Extract Title: the run of letters that follows a space and ends with a period (e.g. " Mr.")
# Raw string so that "\." reaches the regex engine unchanged and Python does not warn about an invalid escape
dataset['Title'] = dataset.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)

# Replace
dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess', 'Dona'], 'Rare')
    
# Print survived mean grouped by title 
train_df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

# Title mapping
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

for dataset in combine:
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)              # fill na values

In [None]:
# Create 'Age' bands
train_df['AgeBand'] = pd.cut(train_df['Age'], 5)  # -> cut: evenly spaced intervals, qcut: same qvalues (same number of occurrences)
train_df[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)

In [None]:
# Create 'IsAlone' column
for dataset in combine:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
    
# Create 'Age*Class' column
for dataset in combine:
    dataset['Age*Class'] = dataset.Age * dataset.Pclass

In [None]:
# Resample data to daily
daily_groups = dataset.resample('D')
daily_data = daily_groups.sum()

In [1]:
doc_list = ["The Learn Python Challenge Casino.", "They bought a car and a casino", "Casinoville"]

keywords = ['casino', 'they']

def word_search(documents, keyword):
    """Return the indices of the documents that contain `keyword` as a whole word.

    Each document is split on whitespace; every word has trailing periods and
    commas removed and is lower-cased before being compared with the
    lower-cased keyword. Partial matches (e.g. 'casino' inside 'Casinoville')
    do not count.
    """
    def _normalized_words(text):
        # Whitespace tokenisation, then strip trailing '.'/',' and lower-case
        return [piece.rstrip('.,').lower() for piece in text.split()]

    return [
        position
        for position, text in enumerate(documents)
        if keyword.lower() in _normalized_words(text)
    ]


# Maps each keyword to the indices returned by word_search
dct = {}

for i, doc in enumerate(doc_list):
    # doc becomes a list of words, so word_search below treats every WORD as a
    # separate document: the indices it returns are word positions inside this
    # document, not document indices
    doc = doc.split()
    print(doc)
    for word in keywords:
        index = word_search(doc, word)
        # Overwritten on every pass of the outer loop: after the loop, dct holds
        # only the result computed for the last document
        dct[word] = index
        print(index)


['The', 'Learn', 'Python', 'Challenge', 'Casino.']
[4]
[]
['They', 'bought', 'a', 'car', 'and', 'a', 'casino']
[6]
[0]
['Casinoville']
[]
[]


### Numpy

In [2]:
import numpy as np

rolls = np.random.randint(low=1, high=6, size=10)
print("Rolls as Numpy ndarray", rolls)

rolls_list = rolls.tolist()

print("Rolls as list",rolls_list)

xlist = [[1,2,3],[2,4,6],]
print("xlist as list",xlist)
# Create a 2-dimensional array
x = np.asarray(xlist)
print("xlist as ndarray\n",x)

print(x)

help(np.ravel)

Rolls as Numpy ndarray [4 1 5 2 4 3 2 4 3 4]
Rolls as list [4, 1, 5, 2, 4, 3, 2, 4, 3, 4]
xlist as list [[1, 2, 3], [2, 4, 6]]
xlist as ndarray
 [[1 2 3]
 [2 4 6]]
[[1 2 3]
 [2 4 6]]
Help on function ravel in module numpy:

ravel(a, order='C')
    Return a contiguous flattened array.
    
    A 1-D array, containing the elements of the input, is returned.  A copy is
    made only if needed.
    
    As of NumPy 1.10, the returned array will have the same type as the input
    array. (for example, a masked array will be returned for a masked array
    input)
    
    Parameters
    ----------
    a : array_like
        Input array.  The elements in `a` are read in the order specified by
        `order`, and packed as a 1-D array.
    order : {'C','F', 'A', 'K'}, optional
    
        The elements of `a` are read using this index order. 'C' means
        to index the elements in row-major, C-style order,
        with the last axis index changing fastest, back to the first
        axis in

In [3]:
x = '0'
x1 = int(x) + 2

print(x1)

s = str(x1)
print(s)

2
2


In [None]:
# Total cells (rows * columns); np.prod replaces the np.product alias, which NumPy 2.0 removed
np.prod(nfl_data.shape)

# Total missing values
nfl_data.isnull().sum().sum()

In [None]:
# Add outcome column, "successful" == 1, others are 0
ks = ks.assign(outcome=(ks['state'] == 'successful').astype(int))

# Assign time stamp columns -> See .dt (day-time) attribute
ks = ks.assign(hour=ks.launched.dt.hour,
               day=ks.launched.dt.day,
               month=ks.launched.dt.month,
               year=ks.launched.dt.year)

### Mapping

In [None]:
# Map 
review_points_mean = reviews.points.mean()
reviews.points.map(lambda p: p - review_points_mean)

# OR

def remean_points(row):
    """Centre one row's `points` on the global mean and return the row.

    Reads the module-level `review_points_mean`, writes the shifted value back
    onto `row.points`, and returns that same row object.
    """
    centred = row.points - review_points_mean
    row.points = centred
    return row

reviews.apply(remean_points, axis='columns')

############################################

# Example: counts the number of words 'tropical', 'fruits' in description
cnt_tropical = reviews.description.map(lambda desc: "tropical" in desc).sum()
cnt_fruity   = reviews.description.map(lambda desc: "fruity" in desc).sum()

descriptor_counts = pd.Series([cnt_tropical, cnt_fruity], index=['tropical', 'fruity'])

### Grouping

In [None]:
# Count by groups
reviews.groupby('points').points.count()

# Min
reviews.groupby('points').price.min()

# Best wine by country and province
reviews.groupby(['country', 'province']).apply(lambda df: df.loc[df.points.idxmax()])

# Grouping with multiple functions
reviews.groupby(['country']).price.agg([len, min, max])

# To convert back to regular index 
countries_reviewed.reset_index()

### Sorting

In [None]:
reviews.sort_values(by='len', ascending=False)

# By more than 1 column
reviews.sort_values(by=['country', 'len'])
reviews.groupby(['country', 'variety']).size().sort_values(ascending=False)

# By index
countries_reviewed.sort_index()

### Data types

In [None]:
# To convert
reviews.points.astype('float64')

# To get all NaN countries
reviews[pd.isnull(reviews.country)]

# To fill NaN
reviews.region_2.fillna("Unknown")

# To replace firstname with secondname
reviews.taster_twitter_handle.replace("@firstname", "@secondname")

### Renaming

In [None]:
# To rename columns
reviews.rename(columns={'region_1': 'region', 'region_2': 'locale'})
reviews.rename(columns=dict(region_1='region', region_2='locale'))

In [None]:
# To rename rows
reviews.rename(index={0: 'firstEntry', 1: 'secondEntry'})

In [None]:
# To rename row and column indexes 
reviews.rename_axis("wines", axis='rows').rename_axis("fields", axis='columns')

### Combining

In [None]:
# Concat puts together DataFrame or Series with same fields
canadian_youtube = pd.read_csv("../input/youtube-new/CAvideos.csv")
british_youtube = pd.read_csv("../input/youtube-new/GBvideos.csv")

pd.concat([canadian_youtube, british_youtube])

# ... OR
pd.concat([df, df2], axis=0, ignore_index=True) # To combine along columns, the axis parameter is set to 1

In [None]:
# Join dataFrame objects which have an index in common
left = canadian_youtube.set_index(['title', 'trending_date'])
right = british_youtube.set_index(['title', 'trending_date'])

left.join(right, lsuffix='_CAN', rsuffix='_UK')

In [None]:
# Merge combines dataframes based on common values in a given column or columns
customer.merge(order, on='id')

In [None]:
# The pivot_table transforms a dataframe to a format that explains the relationship among variables:
# one row per 'name', one column per 'ctg', each cell holding the mean of the remaining values
df.pivot_table(index='name', columns='ctg', aggfunc='mean')

### Scaling and Normalizing

In [None]:
from mlxtend.preprocessing import minmax_scaling
from scipy import stats

original_data = np.random.exponential(size=1000) # Data must be > 0 with BoxCox

# Min-max scale the data between 0 and 1
scaled_data = minmax_scaling(original_data, columns=[0])

# Normalize your data if you're going to be using a machine learning or statistics technique that assumes 
# your data is normally distributed. Some examples of these include linear discriminant analysis (LDA) and 
# Gaussian naive Bayes. (Pro tip: any method with "Gaussian" in the name probably assumes normality.)

# Normalize the exponential data with boxcox
# (called without lmbda, stats.boxcox returns a tuple: the transformed data and the fitted lambda)
normalized_data = stats.boxcox(original_data)

### Dates and Time

In [None]:
# Create a new column, date_parsed, with the parsed dates
landslides['date_parsed'] = pd.to_datetime(landslides['date'], format="%m/%d/%Y")

# Get the day of the month from the date_parsed column
day_of_month_landslides = landslides['date_parsed'].dt.day

### Character Encoding

In [11]:
import chardet

before = "This is the euro symbol: €"

after = before.encode("utf-8", errors="replace")
type(before)
type(after)
print(after.decode("utf-8"))

This is the euro symbol: €


In [None]:
# look at the first ten thousand bytes to guess the character encoding
with open("../input/ks-projects-201801.csv", 'rb') as rawdata: result = chardet.detect(rawdata.read(10000))

print(result) # Gives: {'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}

kickstarter_2016 = pd.read_csv("../input/kickstarter-projects/ks-projects-201612.csv", encoding='Windows-1252')

# save our file (will be saved as UTF-8 by default!)
kickstarter_2016.to_csv("ks-projects-201801-utf8.csv")

In [None]:
# Convert to lower case
professors['Country'] = professors['Country'].str.lower()

# Remove leading and trailing white spaces
professors['Country'] = professors['Country'].str.strip()

In [None]:
import fuzzywuzzy
from fuzzywuzzy import process
import chardet

# Get the top 10 closest matches to "south korea"
matches = fuzzywuzzy.process.extract("south korea", countries, limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)

### Normalize

In [1]:
from sklearn.preprocessing import MinMaxScaler
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
scaler = MinMaxScaler()
print(scaler.fit(data))
MinMaxScaler()
print(scaler.data_max_)
print(scaler.transform(data))

MinMaxScaler()
[ 1. 18.]
[[0.   0.  ]
 [0.25 0.25]
 [0.5  0.5 ]
 [1.   1.  ]]


In [None]:
from sklearn.preprocessing import StandardScaler

# Remove the mean and scale to unit variance to get optimized results
sc = StandardScaler()

# Learn mean and standard deviation from the training data only, then apply that same
# transformation to the test data. Re-fitting on X_test would scale the two sets
# differently and leak test-set statistics into the evaluation.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)