## 정렬
* 데이터 타입에 따른 정렬 함수
    * dataframge정렬 srot_values()
    * list정렬 list.srot() sorted(list)
    * tuple정렬 sorted(tuple.key) <br><br>

* 정렬 알고리즘
    * 선택, 버블, 삽입, ... : O(n^2)
    * 퀵, 병합, 힙, ... : O(nlogn)
    * 계수(O(m+n)) → 빠르지만 정렬할 수 없는 데이터도 존재

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [5]:
df = pd.DataFrame({"s" : [1, 3, 2],
                   "name": ["chio", "kim", "lee"],
                   "age": [30, 20, 40]})
df

Unnamed: 0,s,name,age
0,1,chio,30
1,3,kim,20
2,2,lee,40


In [6]:
df.sort_values()

TypeError: sort_values() missing 1 required positional argument: 'by'

In [7]:
df.sort_values(by="s")

Unnamed: 0,s,name,age
0,1,chio,30
2,2,lee,40
1,3,kim,20


In [8]:
df.sort_values(by="s", ascending=False)

Unnamed: 0,s,name,age
1,3,kim,20
2,2,lee,40
0,1,chio,30


In [11]:
df.sort_values(by="s", ascending=False, kind="quicksort") # 정렬방식 지정 옵션

Unnamed: 0,s,name,age
1,3,kim,20
2,2,lee,40
0,1,chio,30


In [13]:
# 데이터를 정렬해서 저장할 것인지 지정하는 옵션 default : False
df.sort_values(by="s", ascending=False, kind="quicksort", inplace=True) 
df

Unnamed: 0,s,name,age
1,3,kim,20
2,2,lee,40
0,1,chio,30


In [16]:
tp = [("1", "park", 30),
      ("3", "lee", 20),
      ("2", "chio", 40)]
tp

[('1', 'park', 30), ('3', 'lee', 20), ('2', 'chio', 40)]

In [18]:
sorted(tp, key=lambda la:la[0]) # 0번 index를 기준으로 정렬

[('1', 'park', 30), ('2', 'chio', 40), ('3', 'lee', 20)]

In [19]:
sorted(tp, key=lambda la:la[1]) # 0번 index를 기준으로 정렬

[('2', 'chio', 40), ('3', 'lee', 20), ('1', 'park', 30)]

In [21]:
mylist=[0, 9, 7, 2, 4]
sorted(mylist)

[0, 2, 4, 7, 9]

In [22]:
mylist.sort()
mylist

[0, 2, 4, 7, 9]

|  <center></center> |  <center>데이터프레임(df)</center> |  <center>시리즈(s)</center> |
|:--------|:--------:|--------:|
|**행의 개수 세기** | <center>len(df)<br>df.shape[0]<br>len(df.index)</center> |<center>len(s)<br>s.size<br>len(s.index)</center> |
|**열 개수 세기** | <center>df.shape[1]<br>len(df.columns)</center> |<center>X</center> |
|**NULL이 아닌 행의 개수** | <center>df.count()</center> |<center>s.count()</center> |
|**그룹 단위 행 개수** | <center>df.groupby().size</center> |<center>s.groupby().size</center> |
|**그룹 단위 NULL이 아닌 행의 개수** | <center>df.groupby().count() </center> |<center>s.groupby().count()</center> |

In [47]:
df = pd.DataFrame({'grp': ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C'],  
                   'val': [1, 2, np.nan, 4, np.nan, np.nan, 7, 8, 9]}) 
s = pd.Series([1, 2, np.nan, 4, np.nan, np.nan, 7, 8, 9])

In [24]:
df

Unnamed: 0,grp,val
0,A,1.0
1,A,2.0
2,A,
3,B,4.0
4,B,
5,B,
6,C,7.0
7,C,8.0
8,C,9.0


In [25]:
s

0    1.0
1    2.0
2    NaN
3    4.0
4    NaN
5    NaN
6    7.0
7    8.0
8    9.0
dtype: float64

In [28]:
print(len(df))
print(df.shape[0])
print(len(df.index))

9
9
9


In [32]:
print(len(s))
print(s.size)
print(len(s.index))

9
9
9


In [34]:
print(df.shape[1])
print(len(df.columns))

2
2


In [38]:
print(df.count())
print(df["val"].count())
print(s.count())

grp    9
val    6
dtype: int64
6
6


In [41]:
print(df.groupby("grp").size()) # NaN 포함한 각 그룹의 행 개수
print(df.groupby("grp").count()) # NaN을 제외한 각 그룹의 행 개수

grp
A    3
B    3
C    3
dtype: int64
     val
grp     
A      2
B      1
C      3


In [43]:
print(s.groupby(df.grp).size()) 
print(s.groupby(df.grp).count()) 

grp
A    3
B    3
C    3
dtype: int64
grp
A    2
B    1
C    3
dtype: int64


## 열의 특정값을 추출하여 파생변수 만들기

In [49]:
df = pd.DataFrame({'id': ['A_001', 'A_002', 'A_003', 'B_001', 'C_001', 'C_002'],  
                   'val': np.arange(6)})
df

Unnamed: 0,id,val
0,A_001,0
1,A_002,1
2,A_003,2
3,B_001,3
4,C_001,4
5,C_002,5


In [50]:
df.id

0    A_001
1    A_002
2    A_003
3    B_001
4    C_001
5    C_002
Name: id, dtype: object

In [53]:
df.id.split("_")

AttributeError: 'Series' object has no attribute 'split'

In [55]:
# Series → String
df.id.str.split("_")

0    [A, 001]
1    [A, 002]
2    [A, 003]
3    [B, 001]
4    [C, 001]
5    [C, 002]
Name: id, dtype: object

In [61]:
df.id.str.split("_")[0] # 0번 index의 행 데이터

['A', '001']

In [64]:
df["g"] = df.id.str.split("_").str[0]
df

Unnamed: 0,id,val,g
0,A_001,0,A
1,A_002,1,A
2,A_003,2,A
3,B_001,3,B
4,C_001,4,C
5,C_002,5,C


In [65]:
type(df.id.str.split("_").str[0])

pandas.core.series.Series

In [66]:
# Series → list
df.id.str.split("_").str[0].tolist()

['A', 'A', 'A', 'B', 'C', 'C']

In [78]:
for i in range(df.shape[0]): # df의 row의 개수
    print(df.loc[i, "id"])

A_001
A_002
A_003
B_001
C_001
C_002


In [77]:
for i in range(df.shape[0]): 
    print(type(df.loc[i, "id"]))

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>


In [80]:
for i in range(df.shape[0]): 
    print(df.loc[i, "id"].split("_")[0])

A
A
A
B
C
C


In [83]:
for i in range(df.shape[0]): 
    df.loc[i, "gg"] = df.loc[i, "id"].split("_")[0]
df

Unnamed: 0,id,val,g,gg
0,A_001,0,A,A
1,A_002,1,A,A
2,A_003,2,A,A
3,B_001,3,B,B
4,C_001,4,C,C
5,C_002,5,C,C


In [85]:
s = pd.Series([10, 20, 30, 40, 50])
s

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [86]:
s[2]

30

In [87]:
s[2:]

2    30
3    40
4    50
dtype: int64

In [88]:
s[:3]

0    10
1    20
2    30
dtype: int64

In [89]:
s[s >= s.mean()]

2    30
3    40
4    50
dtype: int64

In [91]:
s[[2, 3]]

2    30
3    40
dtype: int64

In [94]:
s = pd.Series([10, 20, 30, 40, 50], index=["a", "b", "c", "d", "e"])
s.ix["c"]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


30

In [95]:
s.ix[["a", "c"]]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


a    10
c    30
dtype: int64

In [96]:
s.get(["a", "c"])

a    10
c    30
dtype: int64

In [97]:
s["b"] = 200
s

a     10
b    200
c     30
d     40
e     50
dtype: int64

In [99]:
# d index 존재 여부
"d" in s

True

In [100]:
df = pd.DataFrame({"c1": [1, 3],
                   "c2": [2, 4]})
df

Unnamed: 0,c1,c2
0,1,2
1,3,4


In [103]:
df = df.assign(c3 = df["c1"]*df["c2"])
df

Unnamed: 0,c1,c2,c3
0,1,2,2
1,3,4,12


## DataFrame에서 col 삭제

In [104]:
df.drop(["c3"], 1)

Unnamed: 0,c1,c2
0,1,2
1,3,4


In [107]:
df # 데이터 자체가 사라지지 않는다

Unnamed: 0,c1,c2,c3
0,1,2,2
1,3,4,12


In [108]:
del df["c3"]

In [109]:
df # 데이터가 갱신됨

Unnamed: 0,c1,c2
0,1,2
1,3,4


## TEXT → WORD
활용 데이터 출처 
* https://en.wikipedia.org/wiki/Python_(programming_language) <br>
* https://www.kaggle.com/rodolfomendes/abalone-dataset 

In [133]:
file_opened = open("./res/python_wikipedia.txt")
file_opened

<_io.TextIOWrapper name='./res/python_wikipedia.txt' mode='r' encoding='cp949'>

In [134]:
for line in file_opened.readlines():
    print(line)

Python programming language, from wikipedia

Python is an interpreted, high-level, general-purpose programming language. Created by Guido van Rossum and first released in 1991, Python's design philosophy emphasizes code readability with its notable use of significant whitespace. Its language constructs and object-oriented approach aims to help programmers write clear, logical code for small and large-scale projects.[26]

Python is dynamically typed and garbage-collected. It supports multiple programming paradigms, including procedural, object-oriented, and functional programming. Python is often described as a "batteries included" language due to its comprehensive standard library.[27]

Python was conceived in the late 1980s as a successor to the ABC language. Python 2.0, released 2000, introduced features like list comprehensions and a garbage collection system capable of collecting reference cycles. Python 3.0, released 2008, was a major revision of the language that is not completel

In [130]:
# 전처리 (소문자, stopword, 특수문자, 대괄호, ..., 숫자)
def word_preprocess(word):
    word = word.lower()
    
    stop_words = ['a', 'an', 'the', 'in', 'with', 'to', 'for', 'from', 'of', 'at', 'on', 'until', 'by', 'and', 'but', 'is', 'are', 'was', 'were', 'it', 'that', 'this', 'my', 'his', 'her', 'our', 'as', 'not'] 
    symbols = [',', '.', ':', '-', '+', '/', '*', '&', '%', '[', ']', '(', ')'] 
    
    for stop_word in stop_words:
        if word != stop_word:
            wrod = word
        else:
            word = ""
    
    for symbol in symbols:
        word.replace(symbol, "") # 단어 내부에 존재하는 symbol도 대체
    
    return word

In [144]:
token_inx = {}
file_opened = open("./res/python_wikipedia.txt")
for line in file_opened.readlines():
    # 공백 제거
    line.strip("") # 문좌 좌우의 공백 제거
    
    for word in line.split():
        word = word_preprocess(word)
        if word not in token_inx: # 기준이 되는 것은 Key
            if word != "":
                # 단어마다 index를 부여하려면
                #token_inx[word] : len(toekn_inx) + 1
                token_inx[word] = 1
        else: # word가 이미 존재한다면
                token_inx[word] = token_inx[word] + 1
print(token_inx)

{'python': 14, 'programming': 3, 'language,': 1, 'wikipedia': 1, 'interpreted,': 1, 'high-level,': 1, 'general-purpose': 1, 'language.': 2, 'created': 1, 'guido': 2, 'van': 2, 'rossum': 2, 'first': 1, 'released': 3, '1991,': 1, "python's": 1, 'design': 1, 'philosophy': 1, 'emphasizes': 1, 'code': 4, 'readability': 1, 'its': 3, 'notable': 1, 'use': 1, 'significant': 1, 'whitespace.': 1, 'language': 4, 'constructs': 1, 'object-oriented': 1, 'approach': 1, 'aims': 1, 'help': 1, 'programmers': 2, 'write': 1, 'clear,': 1, 'logical': 1, 'small': 1, 'large-scale': 1, 'projects.[26]': 1, 'dynamically': 1, 'typed': 1, 'garbage-collected.': 1, 'supports': 1, 'multiple': 1, 'paradigms,': 1, 'including': 1, 'procedural,': 1, 'object-oriented,': 1, 'functional': 1, 'programming.': 1, 'often': 1, 'described': 1, '"batteries': 1, 'included"': 1, 'due': 2, 'comprehensive': 1, 'standard': 1, 'library.[27]': 1, 'conceived': 1, 'late': 1, '1980s': 1, 'successor': 1, 'abc': 1, '2.0,': 1, '2000,': 1, 'intr

In [147]:
np.random.normal(5, size=3) # 5를 평균으로 정규분포를 따르는 값 3개 무작위로 생성

array([3.99556523, 4.1301568 , 4.68676899])

In [149]:
np.random.seed(708)
np.random.normal(size=10)

array([-0.44701866,  0.42271748,  0.35076078, -0.15413394, -1.5020158 ,
        1.30307833,  1.00748836,  0.06666045, -1.47430939, -0.39288867])

## 이항분포
binomial : 이상분포로부터 무작위 표본 추출
* 이산형/연속형 확률 분포
    1. 이산형(이상, 포아송, ...)
    2. 연속형(정규, t분포, 균등, f, 카이제곱, ...)<br>
    
성공(1), 실패(0)<br>
성공확률을 p, 베르누이 시행을 n번 수성공하는 횟수 X<br>
확률 변수 X는 모수(전체 시행회수) n과 p인 이상분포를 따른다 라고 표현<br>
$$f(x) = nCx p^x (1-p)^{(n-x)}$$
$$x = 0,1, ..., n$$


In [153]:
sum(np.random.binomial(n=1, p=0.5, size=20)) / 20 # n = 성공(1)

0.55

In [155]:
#np.random.normal(mu, sigma, size) 평균, 표준편차, 개수
np.random.normal(0, 3, 100)

array([-0.63574142,  0.18675766, -1.87080797, -1.21456478, -0.43458814,
       -1.25167771, -2.20515387, -2.44936339,  3.98588931,  3.54156261,
       -2.49184564, -2.00578431,  3.46666957,  1.75534793, -1.85587612,
        4.11226051, -1.82249905,  5.08921616, -2.6942552 , -4.24375425,
        7.36352146,  0.28220585, -0.27808825,  4.23707537,  1.80913906,
       -4.06653688, -0.92869466,  3.34404997,  2.04035521,  0.10898017,
        0.98188226, -1.48505134,  1.14984669, -3.656114  ,  2.22571485,
       -1.75208816, -0.99799926,  0.7927612 ,  2.83020632, -3.17093664,
       -0.63203011, -5.48890198,  2.8050894 , -3.95254499, -0.53223889,
        2.17131378,  0.20490759, -4.58187782, -1.97621464,  2.21116709,
       -1.9702217 ,  0.02622525, -1.37888258, -0.95812311,  1.84868425,
       -6.5452267 ,  2.62444453,  0.75852764, -5.52046425, -0.12136669,
        4.91738766, -1.57253926,  0.03851431, -1.10083278, -0.38375785,
       -6.49273336, -1.76021681, -5.0190194 ,  0.09651377, -2.97

## 분산
평균으로부터 어느정도 떨어져있는가 → 면적 사용 

## 공분산
어떤 한 변수가 변화가 있을 때 다른 변수는 어떻게 변하는가
두 변수사이의 관계 규명

데이터의 개수로는 변수사이의 관계를 정의할 수 없다

표준화 : 특정 변수의 영향을 최소화하기 위해서

## 유의성 검정
어느정도 신뢰할 수 있는가?
1. P-value
2. T-test
