In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
import konlpy
from konlpy.tag import Okt
from konlpy.tag import Mecab

In [3]:
# import os
# os.environ['JAVA_HOME'] = '/path/to/java/home'
# tokenizer = Okt()
tokenizer = Mecab('/opt/homebrew/lib/mecab/dic/mecab-ko-dic')

In [4]:
text = '함께 탐험하며 성장하는 AI 학교 AIFFEL'
tokenizer.morphs(text)

['함께', '탐험', '하', '며', '성장', '하', '는', 'AI', '학교', 'AIFFEL']

In [5]:
tokenizer.nouns(text)

['탐험', '성장', '학교']

In [6]:
tokenizer.pos(text)

[('함께', 'MAG'),
 ('탐험', 'NNG'),
 ('하', 'XSV'),
 ('며', 'EC'),
 ('성장', 'NNG'),
 ('하', 'XSV'),
 ('는', 'ETM'),
 ('AI', 'SL'),
 ('학교', 'NNG'),
 ('AIFFEL', 'SL')]

In [7]:
# CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [22]:
# 1. Tokenization
words = tokenizer.morphs(text)

In [23]:
# 2. Word Vectorization
vect.fit(words)

In [24]:
# 3. Return trained feature names
vect.get_feature_names_out()

array(['ai', 'aiffel', '성장', '탐험', '학교', '함께'], dtype=object)

In [25]:
# 4. Return trained vocabulary
vect.vocabulary_

{'함께': 5, '탐험': 3, '성장': 2, 'ai': 0, '학교': 4, 'aiffel': 1}

In [18]:
# 5. Transform words to vector
df_t = vect.transform(words)

In [19]:
df_t.toarray()

array([[0, 0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 0]])

In [None]:
pd.DataFrame(df_t.toarray(), columns=vect.get_feature_names_out())

In [None]:
test = 'AI 공부하며 함께 성장해요~'

In [None]:
words =tokenizer.morphs(test)
words

In [None]:
test_t = vect.transform(words)
test_t.toarray()

In [None]:
pd.DataFrame(test_t.toarray(), columns=vect.get_feature_names_out())

In [None]:
# TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer()
words = tokenizer.morphs(text)
vect.fit(words)
vect.vocabulary_

In [None]:
vect.transform(words).toarray()

In [None]:
import pandas as pd
df = pd.read_csv("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", sep="\t")

In [None]:
df

In [None]:
df.shape

In [None]:
df['label'].value_counts()

In [None]:
df.isnull().sum()

In [None]:
df.dropna(inplace=True)

In [None]:
df.isnull().sum()

In [None]:
df['len'] = df['document'].apply(len)
df

In [None]:
import matplotlib.pyplot as plt
df[df.label == 0]['len'].plot(kind='hist', alpha=0.5, bins=50, label='0')

In [None]:
df[df.label == 1]['len'].plot(kind='hist', alpha=0.5, bins=50, label='0')

In [None]:
df = df[:1000]
df.shape

In [None]:
vect = CountVectorizer(tokenizer = tokenizer.morphs)
vectors = vect.fit_transform(df['document'])

In [None]:
from matplotlib.pylab import rand
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

model = RandomForestClassifier(random_state=2022)
cross_val_score(model, vectors, df['label'], scoring='f1', cv=5).mean()

In [None]:

import konlpy
from konlpy.tag import Okt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import pandas as pd

# 데이터 불러오기
df = pd.read_csv("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", sep="\t")
df = df[:1000]

# 형태소 분석기 지정
tokenizer = Mecab('/opt/homebrew/lib/mecab/dic/mecab-ko-dic')

# 결측치 제거
df.dropna(inplace=True)

# 문자열 백터화 (CountVectorizer)
vect = CountVectorizer(tokenizer = tokenizer.morphs)
vectors = vect.fit_transform(df['document'])

# 모델 정의 및 훈련
model = RandomForestClassifier(random_state=2022, n_jobs=-1)
cross_val_score(model, vectors, df['label'], scoring='f1', cv=5).mean()

In [None]:
!pip install git+https://github.com/jungin500/py-hanspell

In [None]:
vect = CountVectorizer(tokenizer = tokenizer.morphs, max_df = 10)
vectors = vect.fit_transform(df['document'])
model = RandomForestClassifier(random_state=2022, n_jobs=-1)
cross_val_score(model, vectors, df['label'], scoring='accuracy', cv=5).mean()

In [None]:
vect = CountVectorizer(tokenizer = tokenizer.morphs, max_df = 2)
vectors = vect.fit_transform(df['document'])
model = RandomForestClassifier(random_state=2022, n_jobs=-1)
cross_val_score(model, vectors, df['label'], scoring='accuracy', cv=5).mean()

In [None]:
text = '함께 탐험하며 성장하는 AI 학교 AIFFEL'
stop_words = ['하며', 'ai']
vect = CountVectorizer(stop_words = stop_words)
words = tokenizer.morphs(text)
vect.fit(words)
vect.vocabulary_

In [None]:
pip install git+https://github.com/haven-jeon/PyKoSpacing.git

In [None]:
from pykospacing import Spacing
spacing = Spacing()
test = '함께탐험하며성장하는AI학교AIFFEL'
spacing(test)

In [None]:
pip install soynlp

In [None]:
from soynlp.normalizer import *
emoticon_normalize('하하하하하하하핰ㅋㅋㅋㅋ호호호호호호홓ㅋㅋㅋㅋ', num_repeats=2)

In [None]:
pip install py-hanspell

In [None]:
!pip install git+https://github.com/ssut/py-hanspell.git

In [None]:
from hanspell import spell_checker
text = '사생활치매. 안핵갈려요. 뺑손이사고. 권투를 빈다. 설흔 즈음에. 문안한 스타일. 거북암이 들다. 마음이 절여온다. 골이따분한 성격. 노력이 숲으로 돌아가다. 일해라 절해라 하지 마세요.'
spell_checker.check(text)

In [26]:

import konlpy
from konlpy.tag import Okt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import pandas as pd

# 데이터 불러오기
df = pd.read_csv("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", sep="\t")
df = df[:1000]

# 형태소 분석기 지정
tokenizer = Mecab('/opt/homebrew/lib/mecab/dic/mecab-ko-dic')

# 결측치 제거
df.dropna(inplace=True)

# 문자열 백터화 (CountVectorizer)
vect = CountVectorizer(tokenizer = tokenizer.morphs)
vectors = vect.fit_transform(df['document'])

# 모델 정의 및 훈련
model = RandomForestClassifier(random_state=2022, n_jobs=-1)
cross_val_score(model, vectors, df['label'], scoring='f1', cv=5).mean()

0.6989417989417989

In [27]:
model.fit(vectors, df['label'])

In [28]:
pred = model.predict(vectors)

In [30]:
from sklearn.metrics import confusion_matrix

# Assuming y_true is your true labels and y_pred is your predicted labels
y_true = df['label']
y_pred = pred

# Compute confusion matrix
cm = confusion_matrix(y_true, y_pred)

print(cm)

[[508   0]
 [  0 492]]


In [36]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report

import konlpy
from konlpy.tag import Okt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import pandas as pd

# 데이터 불러오기
df = pd.read_csv("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", sep="\t")

# 형태소 분석기 지정
tokenizer = Mecab('/opt/homebrew/lib/mecab/dic/mecab-ko-dic')

# 결측치 제거
df.dropna(inplace=True)

# 문자열 백터화 (CountVectorizer)
vect = CountVectorizer(tokenizer = tokenizer.morphs)
vectors = vect.fit_transform(df['document'])

# Assuming vectors is your feature matrix and df['label'] is your labels
X = vectors
y = df['label']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2024)

# Create and train a logistic regression model
model = RandomForestClassifier(random_state=2024, n_jobs=-1)
model.fit(X_train, y_train)

# Predict labels for test set
y_pred = model.predict(X_test)

# Compute accuracy
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')

report = classification_report(y_test, y_pred)
print('\nClassification Report:')
print(report)

Accuracy: 0.8358630191782039

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.86      0.84     22488
           1       0.85      0.82      0.83     22511

    accuracy                           0.84     44999
   macro avg       0.84      0.84      0.84     44999
weighted avg       0.84      0.84      0.84     44999



In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.rcParams['font.family'] = 'Avenir'

acc = []
f1 = []

for i in range(1, 31):
    vect = CountVectorizer(tokenizer = tokenizer.morphs, max_df = i)
    vectors = vect.fit_transform(df['document'])
    model = RandomForestClassifier(random_state=2022, n_jobs=-1)
    acc.append(cross_val_score(model, vectors, df['label'], scoring='accuracy', cv=5).mean())
    f1.append(cross_val_score(model, vectors, df['label'], scoring='f1', cv=5).mean())


fig, ax = plt.subplots(figsize=(13, 5))

sns.barplot(x = range(1, 31), y = acc, color='b', label='accuracy', ax=ax)
sns.lineplot(x = range(1, 31), y = f1, color='r', label='f1', ax=ax)

# Add labels for accuracy
for i, v in enumerate(acc):
    ax.text(i, v + 0.01, "{:.2f}".format(v), ha='center', va='bottom', fontsize=9)

# Add labels for f1 score
for i, v in enumerate(f1):
    ax.text(i, v + 0.01, "{:.2f}".format(v), ha='center', va='bottom', fontsize=9)

# Set y-axis limit
ax.set_ylim([0.2, None])

# Set legend position
ax.legend(loc='lower right')

In [None]:
df

In [None]:
type(vect)

In [None]:
shape = vectors.shape
shape