# Analyze Data

## Summarize Data

In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Wrangling
from acquire_prepare import acquire_oil
from acquire_prepare import prep_data
import numpy as np
import pandas as pd

# Visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import sys # used in command to make entire array print by default
import seaborn as sns
from scipy.stats import iqr

# Modeling
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from scipy import stats
import statsmodels.api as sm
from sklearn.model_selection import train_test_split


from acquire_prepare import acquire_oil
from acquire_prepare import prep_data
from model import get_scaled_df
import xgboost as xgb
from explore import xgb_rank


In [2]:
# Acquire the raw oil data and hand it straight to prep_data with the
# 'midland' argument (presumably a region selector -- confirm in acquire_prepare).
df = prep_data(acquire_oil(), 'midland')

### Data Structure

In [3]:
# Display the (rows, columns) dimensions of the prepared frame.
df.shape

(6164, 61)

## Visualize Data

In [4]:
# df.head(3).T

### Attribute Histograms

In [5]:
# ranked_features = xgb_rank(df, df.recovery)
# ranked_features

In [6]:
def xg_rank(df,target_variable,feature_percent=80,mode='gain'):
    """Rank features by XGBoost importance and select the top contributors.

    The frame is passed through ``get_scaled_df``, an XGBoost model is trained
    on the result against ``target_variable``, and the per-feature importance
    scores are turned into a ranked table with running totals.

    Parameters
    ----------
    df : DataFrame handed to ``get_scaled_df`` to obtain the training matrix.
    target_variable : labels passed to ``xgb.DMatrix`` alongside the scaled frame.
    feature_percent : cumulative-importance cutoff, in percent. Features are
        collected in rank order until the running percentage first exceeds it.
    mode : importance type forwarded to ``Booster.get_score``; also used as the
        name of the score column in the returned table.

    Returns
    -------
    (feature_list, importance_df)
        ``feature_list`` holds the selected feature names, most important first.
        ``importance_df`` has one row per scored feature, sorted by descending
        score, with columns ``rank``, ``feature``, ``<mode>``, ``cum_sum`` and
        ``cum_perc``. The ``scaled_`` marker is removed from every feature
        name in both results. When the model reports no scores at all, an
        empty list and an empty table with those columns are returned.
    """
    scaled_df = get_scaled_df(df)
    xgb_params = {'max_depth': 8, 'seed': 123}
    dtrain = xgb.DMatrix(scaled_df, target_variable, feature_names=scaled_df.columns.values)
    # NOTE(review): `silent` is deprecated in recent xgboost releases in favour
    # of `verbosity`; kept unchanged here to preserve behaviour -- confirm
    # against the installed version.
    model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=50)
    importance_dict = model.get_score(importance_type=mode)

    output_columns = ['rank', 'feature', mode, 'cum_sum', 'cum_perc']
    if not importance_dict:
        # No feature was scored: return an empty, well-formed result instead
        # of failing while labelling a column-less frame.
        return [], pd.DataFrame(columns=output_columns)

    importance_df = pd.DataFrame(list(importance_dict.items()), columns=['feature', mode])
    # One stable descending sort replaces the previous chain of sorts.
    importance_df = (
        importance_df
        .sort_values(mode, ascending=False, kind='mergesort')
        .reset_index(drop=True)
    )
    # Rank is computed on the unrounded scores, as before.
    importance_df.insert(0, 'rank', importance_df[mode].rank(ascending=False))
    importance_df[mode] = importance_df[mode].apply(lambda score: round(score, 2))
    importance_df['cum_sum'] = importance_df[mode].cumsum().round(2)
    importance_df['cum_perc'] = (
        100 * importance_df['cum_sum'] / importance_df[mode].sum()
    ).round(2)
    # Strip the marker from EVERY row; previously rows after the cutoff kept
    # their 'scaled_' prefix because the cleanup lived inside the early-exit loop.
    importance_df['feature'] = importance_df['feature'].str.replace('scaled_', '', regex=False)

    # Collect names in rank order until the running percentage exceeds the cutoff.
    feature_list = []
    for feature_name, cum_percent in zip(importance_df['feature'], importance_df['cum_perc']):
        if cum_percent > feature_percent:
            break
        feature_list.append(feature_name)
    return feature_list, importance_df
    

In [7]:
# plot the important features #
# fig, ax = plt.subplots(figsize=(12,8))
# xgb.plot_importance(model, max_num_features=50, height=0.8, ax=ax)
# plt.show()

In [8]:
# Uses xgb_rank imported from the explore module (not the xg_rank function
# defined in this notebook), with the frame's `recovery` column as the target.
ranked_features, rank_df = xgb_rank(df, df.recovery)
# Display the ranking table returned as the second value.
rank_df

Unnamed: 0,rank,feature,gain,cum_sum,cum_perc
0,1.0,oil_gravity,233693.05,233693.05,35.25
1,2.0,gross_perfs,102573.63,336266.68,50.72
2,3.0,vintage,60778.02,397044.7,59.89
3,4.0,mid_point_lat,47585.85,444630.55,67.07
4,5.0,formation,35792.27,480422.82,72.47
5,6.0,tvd,27356.87,507779.69,76.6
6,7.0,mid_point_long,26853.97,534633.66,80.65
7,8.0,scaled_frac_fluid_gpf,20685.47,555319.13,83.77
8,9.0,scaled_oper,18859.56,574178.69,86.61
9,10.0,scaled_gor_hist,17617.17,591795.86,89.27


In [9]:
# Display the feature-name list returned as the first value of xgb_rank.
ranked_features

['oil_gravity', 'gross_perfs', 'vintage', 'mid_point_lat', 'formation', 'tvd']