In [None]:
from google.colab import files

# 파일 업로드
uploaded = files.upload()  # train.csv, test.csv 파일 선택

In [2]:
import pandas as pd

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

print(train_data.head())
print(test_data.head())

   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  
   id keyword location                                               text
0   0     NaN      NaN                 Just happened a terrible car crash
1   2     NaN      NaN  Heard about #earthquake is different cities, s...
2   3     NaN      NaN  there is a forest fire at spot pond, geese are...
3   9     NaN      NaN           Apocalypse lighting. #Spokane #wildfires
4  11     NaN      NaN      Typhoon Soudelor kills 28 in China and Taiwan


In [3]:
import re

# 텍스트 전처리 함수
def clean_text(text):
    text = re.sub(r"http\S+", "", text)  # URL 제거
    text = re.sub(r"@\w+", "", text)    # 멘션 제거
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # 특수문자 제거
    text = text.lower().strip()        # 소문자 변환 및 공백 제거
    return text

# train 데이터와 test 데이터의 텍스트 전처리
train_data['cleaned_text'] = train_data['text'].apply(clean_text)
test_data['cleaned_text'] = test_data['text'].apply(clean_text)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# TfidfVectorizer를 사용하여 텍스트를 수치화
vectorizer = TfidfVectorizer(max_features=5000)

X = vectorizer.fit_transform(train_data['cleaned_text'])
y = train_data['target']

# 훈련 데이터와 검증 데이터로 분리
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# 모델 훈련
model = LogisticRegression()
model.fit(X_train, y_train)

# 검증 데이터 예측
val_predictions = model.predict(X_val)

# 정확도 확인
print("Validation Accuracy:", accuracy_score(y_val, val_predictions))
print(classification_report(y_val, val_predictions))


Validation Accuracy: 0.8063033486539725
              precision    recall  f1-score   support

           0       0.79      0.89      0.84       874
           1       0.83      0.69      0.75       649

    accuracy                           0.81      1523
   macro avg       0.81      0.79      0.80      1523
weighted avg       0.81      0.81      0.80      1523



In [6]:
# 테스트 데이터 변환 및 예측
X_test = vectorizer.transform(test_data['cleaned_text'])
test_predictions = model.predict(X_test)
