In [6]:
import pandas as pd

cols = ['id', 'text', 'label', 'intensity']
path = "./" 

anger_train = pd.read_csv(path + 'angertraindata.txt', header=None, sep='\t', names=cols, index_col=0)
fear_train = pd.read_csv(path + 'feartraindata.txt', header=None, sep='\t', names=cols, index_col=0)
joy_train = pd.read_csv(path + 'joytraindata.txt', header=None, sep='\t', names=cols, index_col=0)
sad_train = pd.read_csv(path + 'sadtraindata.txt', header=None, sep='\t', names=cols, index_col=0)

print("Anger Train Data:")
print(anger_train.head())
print("\n\n\n")

print("fear Train Data:")
print(fear_train.head())
print("\n\n\n")

print("joy Train Data:")
print(joy_train.head())
print("\n\n\n")

print("sad Train Data:")
print(sad_train.head())
print("\n\n\n")

Anger Train Data:
                                                    text  label  intensity
id                                                                        
10000  How the fu*k! Who the heck! moved my fridge!.....  anger      0.938
10001  So my Indian Uber driver just called someone t...  anger      0.896
10002  @DPD_UK I asked for my parcel to be delivered ...  anger      0.896
10003  so ef whichever butt wipe pulled the fire alar...  anger      0.896
10004  Don't join @BTCare they put the phone down on ...  anger      0.896




fear Train Data:
                                                    text label  intensity
id                                                                       
20000  I feel like I am drowning. #depression #anxiet...  fear      0.979
20001  I get so nervous even thinking about talking t...  fear      0.979
20002                     I lost my blinders .... #panic  fear      0.975
20003  I feel like I am drowning. #depression  #falur...  fear    

In [7]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer

tok = WordPunctTokenizer()
pat1 = r'@[A-Za-z0-9_]+'
pat2 = r'https?://[A-Za-z0-9./]+'
pat3 = r'[0-9]+'
combined_pat = r'|'.join((pat1, pat2, pat3))


stop_words = set(stopwords.words('english'))

def tweet_cleaner(text):
    stripped = re.sub(combined_pat, '', text)
    lower_case = stripped.lower()
    words = tok.tokenize(lower_case)
    filtered_words = [w for w in words if not w in stop_words]
    return ' '.join(filtered_words).strip()

anger_train['clean_text'] = anger_train['text'].apply(tweet_cleaner)
fear_train['clean_text'] = fear_train['text'].apply(tweet_cleaner)
joy_train['clean_text'] = joy_train['text'].apply(tweet_cleaner)
sad_train['clean_text'] = sad_train['text'].apply(tweet_cleaner)

print("Cleaned Anger Train Data:")
print(anger_train.head())
print("\n\n\n")

print("Cleaned Fear Train Data:")
print(fear_train.head())
print("\n\n\n")

print("Cleaned Joy Train Data:")
print(joy_train.head())
print("\n\n\n")

print("Cleaned Sad Train Data:")
print(sad_train.head())
print("\n\n\n")


Cleaned Anger Train Data:
                                                    text  label  intensity  \
id                                                                           
10000  How the fu*k! Who the heck! moved my fridge!.....  anger      0.938   
10001  So my Indian Uber driver just called someone t...  anger      0.896   
10002  @DPD_UK I asked for my parcel to be delivered ...  anger      0.896   
10003  so ef whichever butt wipe pulled the fire alar...  anger      0.896   
10004  Don't join @BTCare they put the phone down on ...  anger      0.896   

                                              clean_text  
id                                                        
10000  fu * k ! heck ! moved fridge !... knock landlo...  
10001  indian uber driver called someone n word . ' m...  
10002  asked parcel delivered pick store address # fu...  
10003  ef whichever butt wipe pulled fire alarm davis...  
10004  ' join put phone , talk rude . taking money ac...  




Cleaned Fe

In [8]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

anger_labels = pd.get_dummies(anger_train['label'])
fear_labels = pd.get_dummies(fear_train['label'])
joy_labels = pd.get_dummies(joy_train['label'])
sad_labels = pd.get_dummies(sad_train['label'])

vectorizer = CountVectorizer(max_features=1000, ngram_range=(1, 3))
X_BoW_anger = vectorizer.fit_transform(anger_train.clean_text)
X_BoW_anger = pd.DataFrame.sparse.from_spmatrix(X_BoW_anger).join(anger_labels)

X_BoW_fear = vectorizer.fit_transform(fear_train.clean_text)
X_BoW_fear = pd.DataFrame.sparse.from_spmatrix(X_BoW_fear).join(fear_labels)

X_BoW_joy = vectorizer.fit_transform(joy_train.clean_text)
X_BoW_joy = pd.DataFrame.sparse.from_spmatrix(X_BoW_joy).join(joy_labels)

X_BoW_sad = vectorizer.fit_transform(sad_train.clean_text)
X_BoW_sad = pd.DataFrame.sparse.from_spmatrix(X_BoW_sad).join(sad_labels)

vectorizer_tfidf = TfidfVectorizer(max_features=1000)
X_tfidf_anger = vectorizer_tfidf.fit_transform(anger_train.clean_text)
X_tfidf_anger = pd.DataFrame.sparse.from_spmatrix(X_tfidf_anger).join(anger_labels)

X_tfidf_fear = vectorizer_tfidf.fit_transform(fear_train.clean_text)
X_tfidf_fear = pd.DataFrame.sparse.from_spmatrix(X_tfidf_fear).join(fear_labels)

X_tfidf_joy = vectorizer_tfidf.fit_transform(joy_train.clean_text)
X_tfidf_joy = pd.DataFrame.sparse.from_spmatrix(X_tfidf_joy).join(joy_labels)

X_tfidf_sad = vectorizer_tfidf.fit_transform(sad_train.clean_text)
X_tfidf_sad = pd.DataFrame.sparse.from_spmatrix(X_tfidf_sad).join(sad_labels)

print("BoW Features for Anger Train Data:")
print(X_BoW_anger.head())
print("\n\n\n")

print("TF-IDF Features for Anger Train Data:")
print(X_tfidf_anger.head())
print("\n\n\n")

print("BoW Features for Fear Train Data:")
print(X_BoW_fear.head())
print("\n\n\n")

print("TF-IDF Features for Fear Train Data:")
print(X_tfidf_fear.head())
print("\n\n\n")

print("BoW Features for Joy Train Data:")
print(X_BoW_joy.head())
print("\n\n\n")

print("TF-IDF Features for Joy Train Data:")
print(X_tfidf_joy.head())
print("\n\n\n")

print("BoW Features for Sad Train Data:")
print(X_BoW_sad.head())
print("\n\n\n")

print("TF-IDF Features for Sad Train Data:")
print(X_tfidf_sad.head())
print("\n\n\n")


BoW Features for Anger Train Data:
   0  1  2  3  4  5  6  7  8  9  ...  991  992  993  994  995  996  997  998  \
0  0  0  0  0  0  0  0  0  0  0  ...    0    0    0    0    0    0    0    0   
1  0  0  0  0  0  0  0  0  0  0  ...    0    0    0    0    0    0    0    0   
2  0  0  0  0  0  0  0  0  0  0  ...    0    0    0    0    0    0    0    0   
3  0  0  0  0  0  0  0  0  0  1  ...    0    0    0    0    0    0    0    0   
4  0  1  0  0  0  0  0  0  0  0  ...    0    0    0    0    0    0    0    0   

   999  anger  
0    0    NaN  
1    0    NaN  
2    0    NaN  
3    0    NaN  
4    0    NaN  

[5 rows x 1001 columns]




TF-IDF Features for Anger Train Data:
   0  1  2  3  4  5         6  7  8  9  ...  991  992  993  994  995  996  \
0  0  0  0  0  0  0         0  0  0  0  ...    0    0    0    0    0    0   
1  0  0  0  0  0  0         0  0  0  0  ...    0    0    0    0    0    0   
2  0  0  0  0  0  0         0  0  0  0  ...    0    0    0    0    0    0   
3  0  0  0  0

In [12]:
joy_dev['clean_text'] = joy_dev['text'].apply(tweet_cleaner)


NameError: name 'joy_dev' is not defined