# Beginning Data Analysis 

In [1]:
import pandas as pd
import numpy as np
# Keep rendered frames compact for the rest of the notebook.
# Fully qualified 'display.*' names are used because a bare 'max_columns' or
# 'max_rows' pattern also matches the 'styler.render.*' options in newer
# pandas releases, which makes set_option raise OptionError.
pd.set_option('display.max_columns', 4)
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_colwidth', 12)

### EDA: Exploratory Data Analysis

## Introduction

## Developing a data analysis routine

### How to do it...

In [2]:
college = pd.read_csv('data/college.csv')
college.sample(random_state=42)                       # draw one random row; random_state=42 seeds the draw so it is repeatable

Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3649,Career P...,San Antonio,...,20700,14977


In [3]:
college.shape                                        # .shape gives the DataFrame's dimensions as (rows, columns)

(7535, 27)

In [None]:
college.info()                                       # lists each column's name, non-null count and dtype, plus total memory usage

In [5]:
college.describe(include=[np.number]).T              # summarize numeric columns only (transposed); np.number is the parent type of the integer and floating dtypes

Unnamed: 0,count,mean,...,75%,max
HBCU,7164.0,0.014238,...,0.000000,1.0
MENONLY,7164.0,0.009213,...,0.000000,1.0
WOMENONLY,7164.0,0.005304,...,0.000000,1.0
RELAFFIL,7535.0,0.190975,...,0.000000,1.0
SATVRMID,1185.0,522.819409,...,555.000000,765.0
...,...,...,...,...,...
PPTUG_EF,6853.0,0.226639,...,0.376900,1.0
CURROPER,7535.0,0.923291,...,1.000000,1.0
PCTPELL,6849.0,0.530643,...,0.712900,1.0
PCTFLOAN,6849.0,0.522211,...,0.745000,1.0


In [8]:
# Summarize the object-dtype (string) columns: count, unique, top and freq.
# The builtin `object` replaces `np.object`, an alias that was deprecated in
# NumPy 1.20 and removed in later releases.
college.describe(include=[object]).T

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  college.describe(include=[np.object]).T   # pd.Categorical


Unnamed: 0,count,unique,top,freq
INSTNM,7535,7535,Alabama ...,1
CITY,7535,2514,New York,87
STABBR,7535,59,CA,773
MD_EARN_WNE_P10,6413,598,PrivacyS...,822
GRAD_DEBT_MDN_SUPP,7503,2038,PrivacyS...,1510


### How it works...

### There's more...

In [9]:
college.describe(include=[np.number],                     # describe() also accepts the list of percentiles to report
   percentiles=[.01, .05, .10, .25, .5,
                .75, .9, .95, .99]).T

Unnamed: 0,count,mean,...,99%,max
HBCU,7164.0,0.014238,...,1.000000,1.0
MENONLY,7164.0,0.009213,...,0.000000,1.0
WOMENONLY,7164.0,0.005304,...,0.000000,1.0
RELAFFIL,7535.0,0.190975,...,1.000000,1.0
SATVRMID,1185.0,522.819409,...,730.000000,765.0
...,...,...,...,...,...
PPTUG_EF,6853.0,0.226639,...,0.946724,1.0
CURROPER,7535.0,0.923291,...,1.000000,1.0
PCTPELL,6849.0,0.530643,...,0.993908,1.0
PCTFLOAN,6849.0,0.522211,...,0.986368,1.0


## Data dictionaries

In [10]:
pd.read_csv('data/college_data_dictionary.csv')     # the data dictionary is a metadata table: one row per column name with its description

Unnamed: 0,column_name,description
0,INSTNM,Institut...
1,CITY,City Loc...
2,STABBR,State Ab...
3,HBCU,Historic...
4,MENONLY,0/1 Men ...
...,...,...
22,PCTPELL,Percent ...
23,PCTFLOAN,Percent ...
24,UG25ABV,Percent ...
25,MD_EARN_...,Median E...


## Reducing memory by changing data types

### How to do it...

In [35]:
college = pd.read_csv('data/college.csv')
different_cols = ['RELAFFIL', 'SATMTMID', 'CURROPER',           # column names covering a mix of dtypes
   'INSTNM', 'STABBR']
col2 = college.loc[:, different_cols]                           # keep every row but only the listed columns
col2.head()

Unnamed: 0,RELAFFIL,SATMTMID,...,INSTNM,STABBR
0,0,420.0,...,Alabama ...,AL
1,0,565.0,...,Universi...,AL
2,1,,...,Amridge ...,AL
3,0,590.0,...,Universi...,AL
4,0,430.0,...,Alabama ...,AL


* 인덱스는 별도로 지정하지 않으면 기본적으로 RangeIndex로 설정

In [12]:
col2.dtypes                  # dtype of each column

RELAFFIL      int64
SATMTMID    float64
CURROPER      int64
INSTNM       object
STABBR       object
dtype: object

* integer와 float의 데이터 형식은 64비트로 기본 설정

In [13]:
original_mem = col2.memory_usage(deep=True)       # memory_usage returns each column's memory use in bytes
original_mem                                       # show the per-column usage before any conversion

Index          128
RELAFFIL     60280
SATMTMID     60280
CURROPER     60280
INSTNM      660240
STABBR      444565
dtype: int64

* deep :bool, 기본값은 False \
  True 인 경우 object 형식으로 된 열의 정확한 메모리양을 추출

In [17]:
col2['RELAFFIL'] = col2['RELAFFIL'].astype(np.int8)    # RELAFFIL only holds 0 and 1, so 64 bits are unnecessary; convert to 8-bit

In [18]:
col2.dtypes              # RELAFFIL's dtype has changed to int8

RELAFFIL       int8
SATMTMID    float64
CURROPER      int64
INSTNM       object
STABBR       object
dtype: object

In [24]:
col2[different_cols].memory_usage(deep=True)      # RELAFFIL now uses far less memory than its original 60280 bytes

Index          128
RELAFFIL      7535
SATMTMID     60280
CURROPER     60280
INSTNM      660699
STABBR       13120
dtype: int64

In [20]:
col2.select_dtypes(include=['object']).nunique()    # number of distinct values (.nunique) in each object-dtype column

INSTNM    7535
STABBR      59
dtype: int64

In [21]:
col2['STABBR'] = col2['STABBR'].astype('category')   # STABBR has 59 distinct values in 7535 rows (under 1%), so convert it to 'category'
col2.dtypes

RELAFFIL        int8
SATMTMID     float64
CURROPER       int64
INSTNM        object
STABBR      category
dtype: object

In [22]:
new_mem = col2.memory_usage(deep=True)        # STABBR drops from 444565 bytes to 13120 bytes
new_mem

Index          128
RELAFFIL      7535
SATMTMID     60280
CURROPER     60280
INSTNM      660699
STABBR       13120
dtype: int64

In [25]:
new_mem / original_mem             # RELAFFIL shrank to 1/8 (12.5%) of its original size and STABBR to about 3%

Index       1.000000
RELAFFIL    0.125000
SATMTMID    1.000000
CURROPER    1.000000
INSTNM      1.000695
STABBR      0.029512
dtype: float64

### How it works...

### There's more...

In [30]:
# Overwrite one CURROPER value with a large number and lengthen one INSTNM
# string by a single character, then re-measure both columns.
college.loc[0, 'CURROPER'] = 10000000
college.loc[0, 'INSTNM'] += 'a'
# CURROPER stays at 60280 bytes: every int64 value takes 8 bytes whatever its
# magnitude. INSTNM grows by only a few bytes because deep=True measures the
# actual strings and just one of them became one character longer.
college.loc[:, ['CURROPER', 'INSTNM']].memory_usage(deep=True)

Index          128
CURROPER     60280
INSTNM      660702
dtype: int64

In [31]:
college['MENONLY'].dtype           # MENONLY holds 0/1 values but also has missing values, so its dtype is float64

dtype('float64')

In [29]:
# Demonstrate that a column containing NaN cannot be cast to an integer dtype.
# The expected error is caught and printed so that a full "Run All" keeps
# going past this cell. pandas raises IntCastingNaNError here, which is a
# ValueError subclass.
try:
    college['MENONLY'].astype(np.int8)
except ValueError as err:
    print(f'{type(err).__name__}: {err}')

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [37]:
# Preview the frame with MENONLY stored as float16 (a float type is needed
# because of its missing values) and RELAFFIL as int8. The result is a new
# DataFrame that is only displayed; `college` itself is not modified.
college.astype({'MENONLY': 'float16', 'RELAFFIL': 'int8'})

Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
0,Alabama ...,Normal,...,30300,33888
1,Universi...,Birmingham,...,39700,21941.5
2,Amridge ...,Montgomery,...,40100,23370
3,Universi...,Huntsville,...,45500,24097
4,Alabama ...,Montgomery,...,26600,33118.5
...,...,...,...,...,...
7530,SAE Inst...,Emeryville,...,,9500
7531,Rasmusse...,Overland...,...,,21163
7532,National...,Highland...,...,,6333
7533,Bay Area...,San Jose,...,,PrivacyS...


In [38]:
college.info()  # college still reports MENONLY as float64 and RELAFFIL as int64

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7535 entries, 0 to 7534
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   INSTNM              7535 non-null   object 
 1   CITY                7535 non-null   object 
 2   STABBR              7535 non-null   object 
 3   HBCU                7164 non-null   float64
 4   MENONLY             7164 non-null   float64
 5   WOMENONLY           7164 non-null   float64
 6   RELAFFIL            7535 non-null   int64  
 7   SATVRMID            1185 non-null   float64
 8   SATMTMID            1196 non-null   float64
 9   DISTANCEONLY        7164 non-null   float64
 10  UGDS                6874 non-null   float64
 11  UGDS_WHITE          6874 non-null   float64
 12  UGDS_BLACK          6874 non-null   float64
 13  UGDS_HISP           6874 non-null   float64
 14  UGDS_ASIAN          6874 non-null   float64
 15  UGDS_AIAN           6874 non-null   float64
 16  UGDS_N

In [33]:
# Replace the RangeIndex with an explicit 64-bit integer index so the two
# memory footprints can be compared. pd.Int64Index was deprecated in
# pandas 1.4 and removed in 2.0, so the index is built through pd.Index from
# the concrete label values instead.
college.index = pd.Index(college.index.to_numpy(), dtype='int64')
college.index.memory_usage()  # 8 bytes per row now, versus the 128 bytes reported for the RangeIndex earlier

60280

## Selecting the smallest of the largest

### How to do it...

In [None]:
movie = pd.read_csv('data/movie.csv')
# Narrow the frame to the three columns this recipe works with.
movie2 = movie.loc[:, ['movie_title', 'imdb_score', 'budget']]
movie2.head(5)

In [None]:
# The 100 rows with the highest imdb_score; preview the first five.
movie2.nlargest(n=100, columns='imdb_score').head(5)

In [None]:
# Among the 100 highest-scored rows, keep the five with the smallest budget.
movie2.nlargest(n=100, columns='imdb_score').nsmallest(n=5, columns='budget')

### How it works...

### There's more...

## Selecting the largest of each group by sorting

### How to do it...

In [None]:
movie = pd.read_csv('data/movie.csv')
# Show only the title, year and score columns.
movie.loc[:, ['movie_title', 'title_year', 'imdb_score']]

In [None]:
# Order the three columns by title_year, highest year first.
movie.loc[:, ['movie_title', 'title_year', 'imdb_score']].sort_values(
    by='title_year', ascending=False
)

In [None]:
# Sort by year, then by score within each year, both descending.
movie.loc[:, ['movie_title', 'title_year', 'imdb_score']].sort_values(
    by=['title_year', 'imdb_score'], ascending=[False, False]
)

In [None]:
# After sorting by year and score (both descending), the first row of each
# year is its highest-scored one; dropping later duplicates of title_year
# keeps exactly that row.
movie.loc[:, ['movie_title', 'title_year', 'imdb_score']].sort_values(
    by=['title_year', 'imdb_score'], ascending=[False, False]
).drop_duplicates(subset=['title_year'], keep='first')

### How it works...

### There's more...

In [None]:
# Same idea with groupby: within each title_year group, sort by score
# descending and keep the first row, then order the result newest year first.
movie.loc[:, ['movie_title', 'title_year', 'imdb_score']].groupby(
    by='title_year', as_index=False
).apply(
    lambda year_rows: year_rows.sort_values(by='imdb_score', ascending=False).iloc[:1]
).sort_values(by='title_year', ascending=False)

In [None]:
# Sort year and content rating descending but budget ascending, so the first
# row of every (year, rating) pair is its lowest-budget one; then keep only
# that first row per pair.
movie.loc[:, ['movie_title', 'title_year', 'content_rating', 'budget']].sort_values(
    by=['title_year', 'content_rating', 'budget'],
    ascending=[False, False, True],
).drop_duplicates(subset=['title_year', 'content_rating'], keep='first')

## Replicating nlargest with sort_values

### How to do it...

In [None]:
movie = pd.read_csv('data/movie.csv')
# Reference result: the five lowest-budget rows among the 100 highest scores.
movie.loc[:, ['movie_title', 'imdb_score', 'budget']].nlargest(
    n=100, columns='imdb_score'
).nsmallest(n=5, columns='budget')

In [None]:
# Replicate nlargest(100, 'imdb_score'): sort by score descending, take 100.
movie.loc[:, ['movie_title', 'imdb_score', 'budget']].sort_values(
    by='imdb_score', ascending=False
).head(n=100)

In [None]:
# Replicate the nlargest/nsmallest pair with two sort-and-head steps:
# top 100 by score, then the five smallest budgets among them.
movie.loc[:, ['movie_title', 'imdb_score', 'budget']].sort_values(
    by='imdb_score', ascending=False
).head(n=100).sort_values(by='budget', ascending=True).head(n=5)

### How it works...

In [None]:
# Look at the last five of the 100 rows that nlargest selects.
movie.loc[:, ['movie_title', 'imdb_score', 'budget']].nlargest(
    n=100, columns='imdb_score'
).tail(5)

In [None]:
# Look at the last five of the 100 rows selected by sort_values + head,
# for comparison with the nlargest version.
movie.loc[:, ['movie_title', 'imdb_score', 'budget']].sort_values(
    by='imdb_score', ascending=False
).head(n=100).tail(5)

## Calculating a trailing stop order price

### How to do it...

In [None]:
import datetime
import pandas_datareader.data as web
import requests_cache
# Build a requests session whose responses are cached under the name 'cache'
# using the sqlite backend, with entries expiring after 90 days.
session = requests_cache.CachedSession(
   cache_name='cache', backend='sqlite', 
   expire_after=datetime.timedelta(days=90))

In [None]:
# Fetch 'tsla' data from the 'yahoo' source starting 2017-1-1, routing the
# request through the cached session. No end date is passed.
tsla = web.DataReader('tsla', data_source='yahoo',
   start='2017-1-1', session=session)
tsla.head(8)

In [None]:
tsla_close = tsla['Close']  # keep just the 'Close' column as a Series

In [None]:
tsla_cummax = tsla_close.cummax()  # running maximum of Close up to and including each row
tsla_cummax.head()

In [None]:
# Trailing stop level: 90% of the running maximum of Close (10% below the
# highest value seen so far); preview the first five rows.
(tsla['Close'].cummax() * .9).head()

### How it works...

### There's more...