![alt_txt](./logo.png)

# Feature Extraction

In [38]:
import numpy as np
import pandas as pd
import os

<img src="../images/image_1_features.PNG" alt="abc"
	title="Scheme1" width="600" height="400"/>
 <img src="../images/image_2_features.PNG" alt="abc"
	title="Scheme1" width="600" height="400"/> 

Cap(battery_data, start_cycle=2, end_cycle=100)  
seven_to_ten(data, cycle=100)  
Temp_minmax(data, start_cycle=2,end_cycle=100)  
q_parameter_wrapped(df,100)  
ChargeAverageTime(df2,100)  
IR(battery_data, start_cycle=2, end_cycle=100)  

In [2]:
def Temp_minmax(data, start_cycle=2,end_cycle=100):
    """
    Calculate the temperature maximum, minimum and time integral over a window of cycles
    ---------
    data is the per-sample cycling data and must contain the columns 'Cycle_Index', 'T' and 't'; start_cycle is the first cycle included in the window; end_cycle is the first cycle excluded from it, i.e. rows with start_cycle <= Cycle_Index < end_cycle are used.
    ---------
    Returns [Max_T, Min_T, T_Integ]: the largest and smallest 'T' value in the window and the trapezoidal integral of 'T' over 't' for the same rows.
    Raises ValueError when no row falls inside the window.
    """
    try:
        from scipy.integrate import trapezoid
    except ImportError:
        # Older SciPy releases only provide the previous name of the same routine.
        from scipy.integrate import trapz as trapezoid

    # Select the rows with a boolean mask: this restricts every statistic to the
    # requested cycles and leaves the caller's DataFrame untouched.
    cycle_index = data['Cycle_Index']
    window = data[(cycle_index >= start_cycle) & (cycle_index < end_cycle)]
    if len(window) == 0:
        raise ValueError(
            'no rows with %r <= Cycle_Index < %r' % (start_cycle, end_cycle))

    Max_T = window['T'].max()
    Min_T = window['T'].min()
    T_Integ = trapezoid(window['T'].values, window['t'].values)

    return [Max_T,Min_T,T_Integ]
#Temp_minmax(df)

In [3]:
def Cap(battery_data, start_cycle=2, end_cycle=100):
    """
    Cap is for calculating features of capacity of a certain cell
    ---------
    battery_data is the variable of reading the raw data by pd.read_filetype() and must contain a 'QD' column; start_cycle is the start cycle number for the training process; end_cycle is the end cycle for the training process. Both are looked up as index labels with .loc, so a missing label raises KeyError.
    ---------
    Returns [cap_s, cap_e, delta_cap]: cap_s is the discharge capacity of the start cycle; cap_e is the discharge capacity of the end cycle; delta_cap is the difference between the max capacity during the whole battery cycles and the capacity of the start cycle.
    """
    capacity = battery_data['QD']
    cap_s = capacity.loc[start_cycle]
    cap_e = capacity.loc[end_cycle]
    # The reference point for the capacity rise is the start cycle, as stated above.
    delta_cap = capacity.max() - cap_s
    return [cap_s, cap_e, delta_cap]
#Cap(df2)

In [4]:
def IR(battery_data, start_cycle=2, end_cycle=100):
    """
    Compute the internal-resistance features of one cell
    ---------
    battery_data is the data read from file with pandas and must contain an 'IR' column; start_cycle and end_cycle are the index labels (looked up with .loc) of the first and last cycle used for training.
    ---------
    Returns [IR_s, Min_IR, delta_IR]: IR_s is the internal resistance at start_cycle; Min_IR is the smallest strictly positive internal resistance over all rows; delta_IR is the internal resistance at end_cycle minus IR_s.
    """
    resistance = battery_data['IR']
    first_ir = resistance.loc[start_cycle]
    # Non-positive readings are excluded before taking the minimum.
    smallest_positive_ir = resistance[resistance > 0].min()
    ir_change = resistance.loc[end_cycle] - first_ir
    return [first_ir, smallest_positive_ir, ir_change]
#IR(df2)

In [5]:
from scipy import stats
def line_fit(data, start, end):
    """
    Fit a straight line to the curve Q(n) over a range of rows (Features 9-12)

    ------

    Inputs

    data: table with the columns 'cycle' and 'QD'

    start: first row position of interest (inclusive)

    end: last row position of interest (exclusive)

    ------

    Returns the (slope, intercept) of the least-squares line of 'QD' against 'cycle'.
    """
    rows = slice(start, end)
    cycle_numbers = np.asarray(data['cycle'][rows])
    capacities = np.asarray(data['QD'][rows])
    fit = stats.linregress(cycle_numbers, capacities)
    # Only the first two fields of the regression result are needed.
    slope, intercept = fit[0], fit[1]
    return slope, intercept
def seven_to_ten(data, cycle=100):
    """
    Return features 7-10 as [slope, intercept, slope, intercept]: the first pair is the line fit over row positions 2 to cycle, the second pair the line fit over the last ten row positions before cycle.
    """
    overall_fit = line_fit(data, 2, cycle)
    recent_fit = line_fit(data, cycle-10, cycle)
    return [overall_fit[0], overall_fit[1], recent_fit[0], recent_fit[1]]
#seven_to_ten(df2)

In [6]:
from scipy.interpolate import interp1d

def discharge_curve_prep(data, higher_cycle, lower_cycle=10):
    """
    Extract the discharge rows of two cycles for the dQ(V) comparison
    ---------
    data is the per-sample cycling data with the columns 'Cycle_Index' and 'I'; higher_cycle is the later cycle to compare; lower_cycle is the reference cycle (10 by default).
    ---------
    Returns (dc10, dc_higher): the selected rows of lower_cycle and of higher_cycle. For each cycle the rows kept are those whose current 'I' is negative and below the mean of that cycle's negative currents.
    Raises KeyError when either cycle is not present in 'Cycle_Index'.
    """
    cycles = data.groupby('Cycle_Index')

    def _below_mean_discharge(cycle_data):
        # Keep negative-current rows first, then only those below their own mean.
        discharge = cycle_data[cycle_data['I'] < 0]
        return discharge[discharge['I'] < discharge['I'].mean()]

    # get_group fetches just the requested cycles; no need to copy every group.
    dc_higher = _below_mean_discharge(cycles.get_group(higher_cycle))
    dc10 = _below_mean_discharge(cycles.get_group(lower_cycle))

    return(dc10, dc_higher)

def q_finder(dc10, dc_higher):
    """
    Interpolate both discharge curves onto a shared voltage grid and subtract them
    ---------
    dc10 and dc_higher are tables with the columns 'V' and 'Qd' for the reference cycle and the later cycle.
    ---------
    Returns (x_common, dy_common): 500 evenly spaced voltages from 2.01 to 3.4 (both ends included) and, at each voltage, Qd of dc_higher minus Qd of dc10.
    """
    voltage_grid = np.linspace(2.01, 3.4, num=500, endpoint=True)

    reference_curve = interp1d(dc10['V'], dc10['Qd'])
    later_curve = interp1d(dc_higher['V'], dc_higher['Qd'])

    later_q = later_curve(voltage_grid)
    reference_q = reference_curve(voltage_grid)

    return voltage_grid, later_q - reference_q

def extract_q_features(V, Q):
    """
    Compute six log-scaled summary statistics of a capacity-difference curve
    ---------
    V is the voltage grid and must contain a value close to 2.01; Q is the curve sampled on that grid (same length as V).
    ---------
    Returns (qmin, qmean, variance, skewness, kurtosis, value_at_2_v), each the natural log of the absolute value of:
    the minimum of Q; the mean of Q; sum(d**2)/(n-1); sum(d**3)/(n-1)/sqrt(sum(d**2))**3; sum(d**4)/(n-1)/(sum(d**2)/(n-1))**2; and Q at the grid point closest to 2.01,
    where d = Q - mean(Q) and n = len(Q).
    Raises ValueError (from math.log) when a statistic is exactly zero and IndexError when V has no value close to 2.01.
    """
    from math import log
    from math import sqrt
    import numpy as np

    Q = np.asarray(Q, dtype=float)
    dof = len(Q) - 1

    # Minimum
    qmin = log(abs(Q.min()))

    # Mean
    raw_mean = Q.mean()
    qmean = log(abs(raw_mean))

    # Deviations are taken from the raw mean, not from its logarithm.
    Q_diff = Q - raw_mean
    sum_sq = np.sum(Q_diff**2)

    # Variance
    variance = log(abs(sum_sq/dof))

    # Skewness
    s = np.sum(Q_diff**3)/dof/(sqrt(sum_sq)**3)
    skewness = log(abs(s))

    # Kurtosis
    k = np.sum(Q_diff**4)/dof/(sum_sq/dof)**2
    kurtosis = log(abs(k))

    # Value at 2 V
    index = int(np.argwhere(np.isclose(V, 2.01))[0][0])
    value_at_2_v = log(abs(Q[index]))

    return(qmin, qmean, variance, skewness, kurtosis, value_at_2_v)

def q_parameter_wrapped(data, cycle):
    """
    Wrapper that chains discharge_curve_prep, q_finder and extract_q_features for one cell:
    it compares the given cycle with the reference cycle and returns the six dQ-curve
    parameters as a list [qmin, qmean, variance, skewness, kurtosis, value_at_2_v].
    """
    reference_rows, later_rows = discharge_curve_prep(data,higher_cycle=cycle)
    voltage, delta_q = q_finder(reference_rows, later_rows)
    stats_min, stats_mean, stats_var, stats_skew, stats_kurt, stats_at_2v = extract_q_features(voltage, delta_q)

    return [stats_min, stats_mean, stats_var, stats_skew, stats_kurt, stats_at_2v]
#q_parameter_wrapped(df,100)

In [7]:
def ChargeAverageTime(data,n):
    """
    Description
    Average charge time
    ------------
    Inputs data: table with a 'chargetime' column; n: row position at which the average stops (exclusive)
    Outputs a one-element list holding the mean 'chargetime' of row positions 2 to n-1

    """
    charge_times = data.iloc[2:n]['chargetime']
    average = charge_times.mean()
    return [average]

In [8]:
def feature_fun(cycling_data,summary_data,cycle=100):
    """
    Assemble the full feature vector of one cell as a flat list.

    cycling_data is the per-sample table and summary_data the per-cycle table of the
    same cell; cycle is the last cycle the features may use. The groups are appended
    in a fixed order: dQ-curve (6), line fits (4), temperature (3), average charge
    time (1), capacity (3), internal resistance (3).
    """
    features = []
    # first 6 features
    features.extend(q_parameter_wrapped(cycling_data,cycle))
    # 7-10 features
    features.extend(seven_to_ten(summary_data, cycle=cycle))
    # 3 features about temperature
    features.extend(Temp_minmax(cycling_data, start_cycle=2,end_cycle=cycle))
    # average charge time
    features.extend(ChargeAverageTime(summary_data,cycle))
    # 3 capacity features
    features.extend(Cap(summary_data, start_cycle=2, end_cycle=cycle))
    # 3 IR features
    features.extend(IR(summary_data, start_cycle=2, end_cycle=cycle))
    return features

### Load data from the training and test datasets

In [36]:
def feature_engineering(path,folder,target,cycle=100):
    """
    Build the feature matrix and label vector for every cell found in a data folder and save both as CSV
    ---------
    path is the folder holding the input .csv files; folder is the output folder (created if missing); target is the prefix of the output file names (e.g. 'train' or 'test'); cycle is passed on to feature_fun.
    ---------
    The .csv files are sorted by name and consumed in pairs: the first file of each pair is read as the cycling data and the second as the summary data. An unpaired trailing file is ignored.
    Writes <folder>/<target>_feature_matrix.csv (one row per cell) and <folder>/<target>_label.csv (number of rows of each summary file), both without header or index. Returns None.
    """
    # os.listdir gives no ordering guarantee, so sort to keep the pairing deterministic.
    filelist = sorted(
        name for name in os.listdir(path)
        if os.path.splitext(name)[1] == '.csv'
    )

    feature_matrix=[]
    label=[]
    for cycling_file, summary_file in zip(filelist[0::2], filelist[1::2]):
        # reading data from the list
        cycling_data = pd.read_csv(os.path.join(path, cycling_file))
        summary_data = pd.read_csv(os.path.join(path, summary_file))
        print ('reading cell:',cycling_file.split('_')[0])
        features = feature_fun(cycling_data,summary_data,cycle=cycle)
        feature_matrix.append(features)
        label.append(len(summary_data))
    print(np.array(feature_matrix).shape)

    # exist_ok avoids a separate existence check and also creates missing parents.
    os.makedirs(folder, exist_ok=True)
    pd.DataFrame(feature_matrix).to_csv(
        os.path.join(folder, target+'_feature_matrix.csv'),index=False, header=False)
    pd.DataFrame(label).to_csv(
        os.path.join(folder, target+'_label.csv'),index=False, header=False)
    #pd(feature_matrix,).to_csv('feature_matrix.csv')
    
    
    

In [37]:
# Build and save the train and test feature sets once per value of `cycle`.
for ls in [80,90,100,110,120]:
    # Both calls write into the same '../cycle<ls>/' folder; the 'train'/'test'
    # target prefix keeps their output file names apart.
    feature_engineering('../Data/','../cycle'+str(ls)+'/','train',cycle=ls)
    feature_engineering('../Data_test/','../cycle'+str(ls)+'/','test',cycle=ls)

reading cell: b1c0
reading cell: b1c11
reading cell: b1c14
reading cell: b1c15
reading cell: b1c16
reading cell: b1c17
reading cell: b1c18
reading cell: b1c19
reading cell: b1c1
reading cell: b1c20
reading cell: b1c21
reading cell: b1c23
reading cell: b1c24
reading cell: b1c25
reading cell: b1c26
reading cell: b1c27
reading cell: b1c28
reading cell: b1c29
reading cell: b1c2
reading cell: b1c30
reading cell: b1c31
reading cell: b1c32
reading cell: b1c33
reading cell: b1c34
reading cell: b1c35
reading cell: b1c36
reading cell: b1c37
reading cell: b1c38
reading cell: b1c39
reading cell: b1c3
reading cell: b1c40
reading cell: b1c41
reading cell: b1c42
reading cell: b1c43
reading cell: b1c44
reading cell: b1c45
reading cell: b1c4
reading cell: b1c5
reading cell: b1c6
reading cell: b1c7
reading cell: b1c9
(41, 20)
reading cell: b2c0
reading cell: b2c10
reading cell: b2c11
reading cell: b2c12
reading cell: b2c13
reading cell: b2c14
reading cell: b2c17
reading cell: b2c18
reading cell: b2c19
r

In [35]:
# Check that the output folder for cycle=80 exists (isdir already returns a bool).
os.path.isdir('../cycle80/')

True