데이터 분석을 위한 패키지

In [1]:
import numpy as np # 선형대수와 통계기능을 제공하는 패키지
import pandas as pd # 행렬기반의 2차원 데이터 처리에 특화
import sklearn  # 분석을 위한 머신러닝 알고리즘과 편리한 API제공
import matplotlib.pyplot as plt # 시각화 패키지

In [2]:
# 의사결정나무 분류모델을 위한 패키지 임포트
from sklearn.tree import DecisionTreeClassifier

# 학습 및 테스트 데이터셋 분리를 위한 패키지 임포트
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv("http://raw.githubusercontent.com/YoungjinBD/dataset/main/iris.csv")

In [4]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [5]:
# 요약정보
df.shape
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [9]:
# 텍스트로 되어있는 species 컬럼의 데이터를 0,1,2로 변환한다.

df['species'].replace({'setosa':0,'versicolor':1,'virginica':2},inplace=True)
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['species'].replace({'setosa':0,'versicolor':1,'virginica':2},inplace=True)
  df['species'].replace({'setosa':0,'versicolor':1,'virginica':2},inplace=True)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [10]:
# 분석 데이터셋 준비
# X는 독립변수, y는 종속변수

X= df[['sepal_length','sepal_width','petal_length','petal_width']]
y=df["species"]

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=11)

In [11]:
# 주어진 데이터로 붓꽃 종류를 구분하는 문제이며, 분류를 위한 알고리즘 중에서 의사결정나무를 이용

dt= DecisionTreeClassifier(random_state=11)
dt.fit(X_train,y_train) # 학습수행

In [12]:
pred= dt.predict(X_test)

In [13]:
# 모델의 성능평가
from sklearn.metrics import accuracy_score
acc= accuracy_score(y_test,pred)
print(acc)

0.9333333333333333


In [14]:
# 모델 성능- 오차행렬
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,pred) # 30개의 데이터셋 중에서 28개만 정확하게 분류

array([[ 9,  0,  0],
       [ 0, 10,  0],
       [ 0,  2,  9]])

In [15]:
# 모델의 성능 평가- 평가지표계산
from sklearn.metrics import classification_report
rpt = classification_report(y_test,pred)
print(rpt)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       0.83      1.00      0.91        10
           2       1.00      0.82      0.90        11

    accuracy                           0.93        30
   macro avg       0.94      0.94      0.94        30
weighted avg       0.94      0.93      0.93        30



In [16]:
# 연관분석 알고리즘
# 연관분석은 하나의 거래나 사건에 포함된 항목간의 관련성을 파악하여 둘 이상의 항목들로 구성된 연관성 규칙을 도출한다.
import numpy as np
import pandas as pd

from mlxtend.frequent_patterns import apriori,association_rules

In [17]:
df= pd.read_csv('http://raw.githubusercontent.com/YoungjinBD/dataset/main/retail_dataset.csv',sep=',')

In [18]:
df

Unnamed: 0,0,1,2,3,4,5,6
0,Bread,Wine,Eggs,Meat,Cheese,Pencil,Diaper
1,Bread,Cheese,Meat,Diaper,Wine,Milk,Pencil
2,Cheese,Meat,Eggs,Milk,Wine,,
3,Cheese,Meat,Eggs,Milk,Wine,,
4,Meat,Pencil,Wine,,,,
...,...,...,...,...,...,...,...
310,Bread,Eggs,Cheese,,,,
311,Meat,Milk,Pencil,,,,
312,Bread,Cheese,Eggs,Meat,Pencil,Diaper,Wine
313,Meat,Cheese,,,,,


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 315 entries, 0 to 314
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       315 non-null    object
 1   1       285 non-null    object
 2   2       245 non-null    object
 3   3       187 non-null    object
 4   4       133 non-null    object
 5   5       71 non-null     object
 6   6       41 non-null     object
dtypes: object(7)
memory usage: 17.4+ KB


In [30]:
# 전처리 수행
# 장바구니 데이터 고유항목 구분 출력
items=set()
for col in df:
  items.update(df[col].unique()) #  주로 데이터 분석에서 사용되는 함수로, 배열이나 데이터프레임에서 중복되지 않는 고유한 값들을 추출할 때 사용

# 장바구니 목록 값(텍스트)을 수치로 표현-각 항목당 매칭될 경우 1로, 아니면 0으로 표시(one-hot encoding)
itemset= set(items)
encoding=[]
for index,row in df.iterrows(): # 이 코드에서는 각 행의 인덱스와 값을 출력
  rowset = set(row)
  labels={}
  dismatching= list(itemset-rowset)
  matching= list(itemset.intersection(rowset)) # 공통요소를 찾을 경우 사용
  for i in dismatching:
    labels[i]=0
  for j in matching:
    labels[j]=1
  encoding.append(labels)
encoding[0]
result = pd.DataFrame(encoding)

result

Unnamed: 0,Bagel,Milk,NaN,Eggs,Bread,Diaper,Cheese,Wine,Meat,Pencil
0,0,0,0,1,1,1,1,1,1,1
1,0,1,0,0,1,1,1,1,1,1
2,0,1,1,1,0,0,1,1,1,0
3,0,1,1,1,0,0,1,1,1,0
4,0,0,1,0,0,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...
310,0,0,1,1,1,0,1,0,0,0
311,0,1,1,0,0,0,0,0,1,1
312,0,0,0,1,1,1,1,1,1,1
313,0,0,1,0,0,0,1,0,1,0


In [34]:
encoding[0]

{'Bagel': 0,
 'Milk': 0,
 nan: 0,
 'Eggs': 1,
 'Bread': 1,
 'Diaper': 1,
 'Cheese': 1,
 'Wine': 1,
 'Meat': 1,
 'Pencil': 1}

In [31]:
result= result.drop(result.columns[2],axis=1)
result

Unnamed: 0,Bagel,Milk,Eggs,Bread,Diaper,Cheese,Wine,Meat,Pencil
0,0,0,1,1,1,1,1,1,1
1,0,1,0,1,1,1,1,1,1
2,0,1,1,0,0,1,1,1,0
3,0,1,1,0,0,1,1,1,0
4,0,0,0,0,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...
310,0,0,1,1,0,1,0,0,0
311,0,1,0,0,0,0,0,1,1
312,0,0,1,1,1,1,1,1,1
313,0,0,0,0,0,1,0,1,0


In [35]:
# 분석시행
freq_items= apriori(result,min_support=0.2,use_colnames=True)

In [36]:
# association_rules로 규칙 도출(신뢰도 임계치 0.6 기반)
rules= association_rules(freq_items,metric='confidence',min_threshold=0.6)
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Bagel),(Bread),0.425397,0.504762,0.279365,0.656716,1.301042,0.064641,1.44265,0.402687
1,(Milk),(Cheese),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148,0.350053
2,(Cheese),(Milk),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148,0.350053
3,(Eggs),(Cheese),0.438095,0.501587,0.298413,0.681159,1.358008,0.07867,1.563203,0.469167
4,(Eggs),(Meat),0.438095,0.47619,0.266667,0.608696,1.278261,0.05805,1.338624,0.387409
