In [1]:
max_length = 256 # sms(가사) 최대 길이

# 1. 데이터 불러오기

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('sms.tsv', sep='\t',)
print(df.columns)
print(df.shape)

Index(['label', 'sms'], dtype='object')
(5572, 2)


In [4]:
df.head()

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# 클래스 파악
classes = sorted(set(df['label']))
class_to_idx = {}

for i, c in enumerate(classes): # 모든 클래스에 대해
    class_to_idx.update({c: i})
    
nclass = len(classes)

print("# of classes: %d" %nclass)
print(classes)
print(class_to_idx)

# of classes: 2
['ham', 'spam']
{'ham': 0, 'spam': 1}


# 2. 새로운 DataFrame

## 1) 'label, sms'만 남기기

## 2) 최대 텍스트 길이 만큼 자르기 # pandas.Series.str.slice

* '성별, 가사'만 남기려면?

In [6]:
new_df = pd.DataFrame({'label':df['label'],
                      'sms':df['sms'].str.slice( # 최대 가사 텍스트 만큼 자르기
                          start=0, stop = max_length)
                      })

## 3) 중복 제거

In [7]:
len(new_df)

5572

In [8]:
new_df = pd.DataFrame( new_df.drop_duplicates() )

In [9]:
len(new_df)

5169

## 4) 셔플

In [10]:
df_shuffled=new_df.sample(frac=1).reset_index(drop=True)
df_shuffled.head()

Unnamed: 0,label,sms
0,spam,Knock Knock Txt whose there to 80082 to enter ...
1,ham,Yup but it's not giving me problems now so may...
2,ham,Sos! Any amount i can get pls.
3,ham,"Sorry, left phone upstairs. OK, might be hecti..."
4,ham,"K, I might come by tonight then if my class le..."


## 5) train, test 나누기

In [11]:
# train: test = 9:1
# train: test = 540: 60 -> train:valid:test = 432:108:60
train_ratio = 0.9

#train dataset
s, e = 0, int(df_shuffled.shape[0] * train_ratio) # # of rows
df_train = pd.DataFrame({'label': df_shuffled['label'][s:e],
                        'sms':df_shuffled['sms'][s:e]})
print("index for train: %d~%d" %(s,e))

#test dataset
s, e = e, e+int(df_shuffled.shape[0] * (1.0 - train_ratio)) # # of rows
print("index for test: %d~%d" %(s,e))
df_test = pd.DataFrame({'label': df_shuffled['label'][s:e],
                        'sms':df_shuffled['sms'][s:e]})

index for train: 0~4652
index for test: 4652~5168


In [12]:
# column 수 확인
print(df_train.shape)
print(df_test.shape)

(4652, 2)
(516, 2)


## 4) 저장

In [13]:
# new_df.columns : ['성별', '가사']
df_train.to_csv('./sms.maxlen.uniq.shuf.train.tsv',
               header = False, index=False, sep='\t')

df_test.to_csv('./sms.maxlen.uniq.shuf.test.tsv',
               header = False, index=False, sep='\t')