In [1]:
import pandas as pd
import numpy as np

In [70]:
# 1. Count null values per group
df1 = pd.DataFrame(dict(A=[1, 2] * 3, B=[1, 2, None, 4, None, None]))
df1
# Three ways to count the nulls of column B for each value of A.
# Only the last expression of the cell is displayed.
# (a) sum the boolean null mask of B, grouped by A
df1['B'].isnull().groupby(df1['A']).sum()
# (b) group by A and count the nulls of B inside each group
df1.groupby(by='A')['B'].apply(lambda x: x.isnull().sum())
# (c) keep A only where B is null and count each A value; unlike (a) and (b),
#     a group without any null does not appear in this result.
#     (The original line referenced an undefined `df` instead of `df1`.)
df1['A'][df1['B'].isnull()].value_counts()

1    2
2    1
Name: A, dtype: int64

In [71]:
# Items 2 to 6 are adapted from: https://codeutility.org/python-identifying-consecutive-nans-with-pandas-stack-overflow/
# 2. Get the length of the longest run of consecutive NaN
data = pd.Series([np.nan, 1, 1, 1, 1, 1, np.nan, np.nan, np.nan, 1, 1, np.nan, np.nan])
# The running count of non-NaN values does not change across a run of NaN, so
# keeping that count only at the NaN positions labels every NaN with its run.
non_na_running_count = data.notna().cumsum()
na_groups = non_na_running_count[data.isna()]
# Number of times each label repeats == number of NaN in that run.
lengths_consecutive_na = na_groups.groupby(na_groups).agg(len)
# The largest run length.
longest_na_gap = lengths_consecutive_na.max()
longest_na_gap

3

In [72]:
# 3. Count the total number of NaN in a single column
# Two ways:
#   1) number of rows minus the number of non-null entries
#   2) sum of the boolean null mask
column_b = df1['B']
print(len(column_b) - column_b.count())
print(column_b.isnull().sum())

3
3


In [73]:
# 4. Find where consecutive nulls occur
# np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
df2 = pd.DataFrame({'a': [1, 3, np.nan, np.nan, 4, np.nan, 6, 7, 8]})
# A row is selected when both it and the row directly above it are null, so the
# first null of each run is not selected; only the following ones are.
df2[df2['a'].isnull() & df2['a'].shift(1).isnull()]

Unnamed: 0,a
3,


In [74]:
# 5. Add a column that counts consecutive nulls
# np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
df3 = pd.DataFrame({'a': [1, 2, np.nan, np.nan, np.nan, 6, 7, 8, 9, 10, np.nan, np.nan, 13, 14]})

## Explanation: the cumulative sum of the not-null mask stays constant across a
## run of nulls, so rows sharing the same value mark where the nulls are.
run_id = df3['a'].notnull().astype(int).cumsum()
## The null mask holds 1 at every index where a null is present.
is_null = df3['a'].isnull().astype(int)
## Grouping the null mask by run_id gives the following groups:
## 1:[0], 2:[0, 1, 1, 1], 3:[0], 4:[0], 5:[0], 6:[0], 7:[0,1,1], 8:[0], 9:[0]
## so summing each group yields the total number of consecutive nulls per run.
is_null.groupby(run_id).sum()
## To see the count at the original index positions instead, replace sum with
## cumsum: every null row then holds its position inside its run.
consec_count = is_null.groupby(run_id).cumsum()
# Attach the result to the original table as a new column.
pd.concat([df3, consec_count.to_frame('consec_count')], axis=1)

Unnamed: 0,a,consec_count
0,1.0,0
1,2.0,0
2,,1
3,,2
4,,3
5,6.0,0
6,7.0,0
7,8.0,0
8,9.0,0
9,10.0,0


In [75]:
# 6. Count consecutive NaN within each group of a groupby Series object
def count_consec_nan(array):
    """Return, for every element, its position inside the current run of nulls.

    Non-null elements get 0; the first null of a run gets 1, the next 2, and
    so on. The result has the same index as ``array``.
    """
    null_flag = array.isnull().astype(int)
    # The running count of non-null values is constant across a run of nulls,
    # so it serves as the key that separates one run from the next.
    run_key = array.notnull().astype(int).cumsum()
    return null_flag.groupby(run_key).cumsum()
# The commented call below does not work (original note: the grouped Series
# object has no isnull function). Aggregation expects one value per group,
# whereas count_consec_nan yields one value per row, so transform is used.
# df.groupby('A')['B'].agg(count_consec_nan)
# np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
df4 = pd.DataFrame({'a': [1, 3, np.nan, np.nan, 4, np.nan, 6, 7, 8],
                    'b': ['q', 'w', 'q', 'q', 'q', 'w', 'q', 'q', 'w']})
# The consecutive-null counter restarts inside each group of 'b'.
df4['new'] = df4.groupby('b')['a'].transform(count_consec_nan)
df4

# 6.1. Count consecutive identical values (same run-labelling idea as above)
# (df['price_avg'] == df['price_avg_shift_']).groupby(
#     (df['price_avg'] != df['price_avg_shift_']).cumsum()
# ).cumsum()

Unnamed: 0,a,b,new
0,1.0,q,0
1,3.0,w,0
2,,q,1
3,,q,2
4,4.0,q,0
5,,w,1
6,6.0,q,0
7,7.0,q,0
8,8.0,w,0


In [2]:
# 7. Flag values that fall outside the normal range
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where the file exists at exactly this location.
airline_stats_csv = '/Users/cho-eungi/Practice/CSV/airline_stats.csv'
sample_df = pd.read_csv(airline_stats_csv)
sample_df

Unnamed: 0,pct_carrier_delay,pct_atc_delay,pct_weather_delay,airline
0,8.153226,1.971774,0.762097,American
1,5.959924,3.706107,1.585878,American
2,7.157270,2.706231,2.026706,American
3,12.100000,11.033333,0.000000,American
4,7.333333,3.365591,1.774194,American
...,...,...,...,...
33463,6.186422,8.798491,1.651940,Southwest
33464,9.522167,3.591133,0.261084,Southwest
33465,9.164179,2.664179,0.343284,Southwest
33466,5.152293,1.964520,0.122817,Southwest


In [128]:
def exceed_iqr(array, whisker=1.5):
    """Flag the values of ``array`` that lie outside its IQR fences.

    The upper fence is ``Q3 + whisker * IQR`` and the lower fence is
    ``Q1 - whisker * IQR``, where ``IQR = Q3 - Q1``. A negative lower fence is
    raised to 0.

    Parameters
    ----------
    array : pandas.Series
        Numeric values to check (must provide ``.quantile``).
    whisker : float, default 1.5
        Multiple of the IQR added to Q3 and subtracted from Q1.

    Returns
    -------
    numpy.ndarray
        Integer array aligned with ``array``: 1 where the value is above the
        upper fence or below the lower fence, 0 otherwise. NaN compares False
        against both fences and is therefore flagged 0.
    """
    # Each quartile is computed once and reused for the IQR and both fences.
    q1 = array.quantile(.25)
    q3 = array.quantile(.75)
    iqr = q3 - q1
    threshold_max = q3 + whisker * iqr
    threshold_min = q1 - whisker * iqr
    if threshold_min < 0:
        threshold_min = 0

    # Vectorized comparison replaces the per-element Python loop.
    outside = (array > threshold_max) | (array < threshold_min)
    return np.asarray(outside.astype(int))
    
# Flag, separately within each airline, the carrier-delay values that fall
# outside that airline's own IQR fences.
carrier_delay_by_airline = sample_df.groupby(by='airline')['pct_carrier_delay']
sample_df['new'] = carrier_delay_by_airline.transform(exceed_iqr)

# Show only the flagged rows.
sample_df.loc[sample_df['new'] == 1]

Unnamed: 0,pct_carrier_delay,pct_atc_delay,pct_weather_delay,airline,new
22,21.419355,4.709677,1.419355,American,1
192,15.580645,3.774194,0.000000,Delta,1
199,25.000000,25.000000,0.000000,Delta,1
238,24.000000,9.333333,0.000000,Delta,1
246,13.903226,5.451613,0.000000,Delta,1
...,...,...,...,...,...
33075,8.866667,27.800000,0.000000,Alaska,1
33083,9.713483,5.859551,0.859551,Alaska,1
33089,9.396552,6.931034,3.241379,Alaska,1
33096,11.111111,0.000000,0.000000,Alaska,1


In [138]:
# 8. count vs value_counts // groupby with a function
carrier_delay = sample_df['pct_carrier_delay']
print(sample_df.count())                     # non-null entries per column
print(carrier_delay.value_counts())          # occurrences of each distinct value
print(carrier_delay.value_counts(bins=10))   # occurrences per interval, 10 bins

print(sample_df.groupby(by='airline').groups)  # each group with the index labels of its rows
def compare(idx):
    """Tell whether the ``sample_df`` row labelled ``idx`` has a carrier delay
    greater than its ATC delay."""
    row = sample_df.loc[idx]
    return row.pct_carrier_delay > row.pct_atc_delay
# Passing a function to groupby calls it on every index label and groups the
# rows by the value it returns (False / True here).
sample_df.groupby(compare).groups

pct_carrier_delay    33440
pct_atc_delay        33440
pct_weather_delay    33440
airline              33468
new                  33468
dtype: int64
0.000000     982
3.225806     134
3.333333      80
6.451613      54
6.666667      46
            ... 
10.346154      1
13.840708      1
5.776187       1
2.272619       1
3.964393       1
Name: pct_carrier_delay, Length: 26916, dtype: int64
(-0.101, 10.0]    26919
(10.0, 20.0]       6129
(20.0, 30.0]        325
(30.0, 40.0]         43
(40.0, 50.0]         14
(90.0, 100.0]         5
(60.0, 70.0]          2
(50.0, 60.0]          1
(70.0, 80.0]          1
(80.0, 90.0]          1
Name: pct_carrier_delay, dtype: int64
{'Alaska': [79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523

{False: [6, 10, 15, 19, 20, 24, 26, 32, 33, 34, 35, 36, 37, 46, 50, 66, 79, 80, 82, 83, 84, 85, 86, 90, 91, 92, 93, 96, 97, 98, 99, 101, 102, 103, 104, 105, 106, 108, 109, 110, 111, 113, 115, 117, 119, 120, 122, 124, 129, 130, 137, 138, 143, 145, 146, 147, 148, 149, 151, 154, 156, 158, 161, 162, 164, 169, 172, 174, 176, 177, 178, 183, 184, 185, 186, 189, 190, 193, 194, 196, 198, 199, 200, 202, 203, 204, 206, 207, 211, 212, 213, 214, 216, 217, 219, 221, 222, 223, 226, 227, ...], True: [0, 1, 2, 3, 4, 5, 7, 8, 9, 11, 12, 13, 14, 16, 17, 18, 21, 22, 23, 25, 27, 28, 29, 30, 31, 38, 39, 40, 41, 42, 43, 44, 45, 47, 48, 49, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 81, 87, 88, 89, 94, 95, 100, 107, 112, 114, 116, 118, 121, 123, 125, 126, 127, 128, 131, 132, 133, 134, 135, 136, 139, 140, 141, 142, 144, 150, 152, 153, 155, 157, 159, 160, 163, ...]}

In [6]:
class Price_Alarm(pd.DataFrame):
    """DataFrame subclass whose ``alarm_*`` methods add 0/1 flag columns in place.

    The methods index, group and assign on ``self`` as a DataFrame, so the
    class derives from ``pandas.DataFrame``; this also makes
    ``Price_Alarm(some_frame)`` a valid construction.

    Columns read by the methods:
      * alarm_1, alarm_4: 'entry_id', 'date', 'price_avg'
      * alarm_2: 'entry_id', 'price_avg'
      * alarm_3: 'airline', 'pct_atc_delay'

    NOTE(review): alarm_1 and alarm_4 take the previous price with a plain
    ``shift(1)``, so they presumably expect rows ordered by entry_id and then
    date -- confirm against the caller. Pass ``frame.copy()`` to the
    constructor if the source frame must stay untouched.
    """

    # Minimum value of 'consec_count_same' that raises alarm4.
    alarm_4_threshold = 8

    @property
    def _constructor(self):
        # Lets pandas operations that build a new frame return Price_Alarm.
        return Price_Alarm

    def groupby_array(self, column1, column2):
        """Return ``column2`` grouped by ``column1`` (a grouped Series)."""
        return self.groupby(by=column1)[column2]

    def alarm_1(self):
        """Add 'price_avg_chg_' (price / previous price) and 'alarm1' (ratio > 2).

        The ratio is computed only where both the price and the previous price
        are non-null, otherwise it is 0. The first date of every entry_id has
        no previous price of its own, so its ratio is set to NaN.
        """
        prev_price = self['price_avg'].shift(1)
        both_present = self['price_avg'].notnull() & prev_price.notnull()
        price_avg_chg = np.where(both_present, self['price_avg'] / prev_price, 0)
        # Dense rank of the date inside each entry; rank 1 is the earliest date.
        date_rank = self.groupby_array('entry_id', 'date').rank(method="dense", ascending=True)
        self['price_avg_chg_'] = np.where(date_rank == 1, np.nan, price_avg_chg)
        # NaN > 2 is False, so rows without a ratio never raise the alarm.
        self['alarm1'] = np.where(self['price_avg_chg_'] > 2, 1, 0)

    def alarm_2(self):
        """Add 'consec_null3' (running count of consecutive null prices per
        entry_id) and 'alarm2' (1 where that count is at least 3).

        Relies on the module-level ``count_consec_nan`` helper.
        """
        self['consec_null3'] = self.groupby_array('entry_id', 'price_avg').transform(count_consec_nan)
        self['alarm2'] = np.where(self['consec_null3'] >= 3, 1, 0)

    def alarm_3(self):
        """Add 'alarm3' by applying ``exceed_3sigma`` to 'pct_atc_delay' per airline.

        NOTE(review): ``exceed_3sigma`` is not defined in the visible part of
        this notebook; it must exist before this method is called.
        """
        group = self.groupby_array("airline", "pct_atc_delay")
        self['alarm3'] = group.transform(exceed_3sigma)

    def alarm_4(self):
        """Add 'price_avg_shift', 'consec_count_same' and 'alarm4'.

        'price_avg_shift' is the previous row's price (NaN on the first date of
        every entry_id). 'consec_count_same' counts how many rows in a row the
        price has equalled the previous price. 'alarm4' is 1 where that count
        reaches ``alarm_4_threshold``.
        """
        date_rank = self.groupby_array('entry_id', 'date').rank(method="dense", ascending=True)
        self['price_avg_shift'] = np.where(date_rank == 1, np.nan, self['price_avg'].shift(1))
        unchanged = self['price_avg'] == self['price_avg_shift']
        # Every row whose price differs from the previous one starts a new run.
        run_id = (self['price_avg'] != self['price_avg_shift']).cumsum()
        self['consec_count_same'] = unchanged.astype(int).groupby(run_id).cumsum()
        self['alarm4'] = np.where(
            self['consec_count_same'] >= type(self).alarm_4_threshold, 1, 0)

In [7]:
# Preview the first five rows of the airline delay frame.
sample_df.head()

Unnamed: 0,pct_carrier_delay,pct_atc_delay,pct_weather_delay,airline
0,8.153226,1.971774,0.762097,American
1,5.959924,3.706107,1.585878,American
2,7.15727,2.706231,2.026706,American
3,12.1,11.033333,0.0,American
4,7.333333,3.365591,1.774194,American


In [None]:
# Instantiate Price_Alarm with the airline delay frame.
# NOTE(review): alarm_1, alarm_2 and alarm_4 group by an 'entry_id' column,
# which the sample_df preview does not show -- confirm the intended input.
sample = Price_Alarm(sample_df)