# Library

In [1]:
import pandas as pd1
import pandas as pd
data=pd.read_csv('polarity.csv')

import nltk
stopwords = nltk.corpus.stopwords.words('english')
import string
import re
from nltk.stem import PorterStemmer
ps = PorterStemmer()
import numpy as np

# Text Cleaning - Removing Punctuations

In [2]:
def remove_punctuation(txt):
    txt_nopunct = "".join([c for c in txt if c not in string.punctuation])
    return txt_nopunct

In [3]:
data['msg_clean'] = data['document'].apply(lambda x: remove_punctuation(x)) 
data.head()

Unnamed: 0,class_label,document,msg_clean
0,neg,"tristar / 1 : 30 / 1997 / r ( language , viole...",tristar 1 30 1997 r language violence d...
1,neg,arlington road * 1/4 . directed by mark pellin...,arlington road 14 directed by mark pellingto...
2,neg,the brady bunch movie is less a motion picture...,the brady bunch movie is less a motion picture...
3,neg,janeane garofalo in a romantic comedy -- it wa...,janeane garofalo in a romantic comedy it was ...
4,neg,"i'm going to keep this plot summary brief , so...",im going to keep this plot summary brief some...


# Text Cleaning - Tokenizing Non-Words

In [4]:
def tokenize(txt):
    tokens = re.split('\W+',txt)
    return tokens

In [5]:
data['msg_clean_tokenized']  = data['msg_clean'].apply(lambda x: tokenize(x.lower()))
data.head()

Unnamed: 0,class_label,document,msg_clean,msg_clean_tokenized
0,neg,"tristar / 1 : 30 / 1997 / r ( language , viole...",tristar 1 30 1997 r language violence d...,"[tristar, 1, 30, 1997, r, language, violence, ..."
1,neg,arlington road * 1/4 . directed by mark pellin...,arlington road 14 directed by mark pellingto...,"[arlington, road, 14, directed, by, mark, pell..."
2,neg,the brady bunch movie is less a motion picture...,the brady bunch movie is less a motion picture...,"[the, brady, bunch, movie, is, less, a, motion..."
3,neg,janeane garofalo in a romantic comedy -- it wa...,janeane garofalo in a romantic comedy it was ...,"[janeane, garofalo, in, a, romantic, comedy, i..."
4,neg,"i'm going to keep this plot summary brief , so...",im going to keep this plot summary brief some...,"[im, going, to, keep, this, plot, summary, bri..."


# Text Cleaning - Removing Stop Words

In [6]:
def remove_stopwords(txt_tokenized):
    txt_clean = [word for word in txt_tokenized if word not in stopwords]
    return txt_clean

In [7]:
data['msg_no_stopwords']  = data['msg_clean_tokenized'].apply(lambda x: remove_stopwords(x))
data.tail()

Unnamed: 0,class_label,document,msg_clean,msg_clean_tokenized,msg_no_stopwords
1395,pos,one of the last entries in the long-running ca...,one of the last entries in the longrunning car...,"[one, of, the, last, entries, in, the, longrun...","[one, last, entries, longrunning, carry, serie..."
1396,pos,"hype ? sheesh , like no other . this side of t...",hype sheesh like no other this side of tita...,"[hype, sheesh, like, no, other, this, side, of...","[hype, sheesh, like, side, titanic, good, hunt..."
1397,pos,for those of us who weren't yet born when the ...,for those of us who werent yet born when the 1...,"[for, those, of, us, who, werent, yet, born, w...","[us, werent, yet, born, 1960s, rock, n, rolled..."
1398,pos,what starts out as a monotonous talking-head m...,what starts out as a monotonous talkinghead mu...,"[what, starts, out, as, a, monotonous, talking...","[starts, monotonous, talkinghead, musical, his..."
1399,pos,jackie brown ( miramax - 1997 ) starring pam g...,jackie brown miramax 1997 starring pam grie...,"[jackie, brown, miramax, 1997, starring, pam, ...","[jackie, brown, miramax, 1997, starring, pam, ..."


In [8]:
#Text Cleaning - Stemming(Porter Stemming)

In [9]:
def stemming(tokenize_text):
    text = [ps.stem(word) for word in tokenize_text]
    return text

In [10]:
data['msg_stemmed'] = data['msg_no_stopwords'].apply(lambda x: stemming(x))
data.head()

Unnamed: 0,class_label,document,msg_clean,msg_clean_tokenized,msg_no_stopwords,msg_stemmed
0,neg,"tristar / 1 : 30 / 1997 / r ( language , viole...",tristar 1 30 1997 r language violence d...,"[tristar, 1, 30, 1997, r, language, violence, ...","[tristar, 1, 30, 1997, r, language, violence, ...","[tristar, 1, 30, 1997, r, languag, violenc, de..."
1,neg,arlington road * 1/4 . directed by mark pellin...,arlington road 14 directed by mark pellingto...,"[arlington, road, 14, directed, by, mark, pell...","[arlington, road, 14, directed, mark, pellingt...","[arlington, road, 14, direct, mark, pellington..."
2,neg,the brady bunch movie is less a motion picture...,the brady bunch movie is less a motion picture...,"[the, brady, bunch, movie, is, less, a, motion...","[brady, bunch, movie, less, motion, picture, m...","[bradi, bunch, movi, less, motion, pictur, min..."
3,neg,janeane garofalo in a romantic comedy -- it wa...,janeane garofalo in a romantic comedy it was ...,"[janeane, garofalo, in, a, romantic, comedy, i...","[janeane, garofalo, romantic, comedy, good, id...","[janean, garofalo, romant, comedi, good, idea,..."
4,neg,"i'm going to keep this plot summary brief , so...",im going to keep this plot summary brief some...,"[im, going, to, keep, this, plot, summary, bri...","[im, going, keep, plot, summary, brief, someth...","[im, go, keep, plot, summari, brief, someth, w..."



# Text Cleaning (combined)

In [11]:
def clean_text(txt):
    txt = "".join([c for c in txt if c not in string.punctuation])
    tokens = re.split('\W+',txt)
    txt = [ps.stem(word) for word in tokens if word not in stopwords]
    return txt
    

# TFIDF Vectorizer

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X = tfidf_vect.fit_transform(data['document'])
print(X.shape)




(1400, 28635)


In [13]:
df = pd1.DataFrame(X.toarray(), columns=tfidf_vect.get_feature_names())
df
    

Unnamed: 0,Unnamed: 1,0,00,000,000foot,007,007esqu,0100,0195714,0195714a,...,â,élan,élodi,étienn,éva,één,œf¹,œnet,œwe,š
0,0.008420,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.006283,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.015501,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.009261,0.0,0.0,0.035983,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.005248,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1395,0.010165,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1396,0.016414,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1397,0.012523,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1398,0.009627,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
pd1.set_option('display.max_columns',1000)
pd1.set_option('display.max_rows',1000)


# Computing TF

In [None]:
documentA = "the man went out for a walk"
documentB = "the children sat around the fire"

In [None]:
bagOfWordsA = documentA.split(" ")
bagOfWordsB = documentB.split(" ")

In [None]:
uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))

In [None]:
print(uniqueWords)

In [None]:
numOfWordsA = dict.fromkeys(uniqueWords,0)
for words in bagOfWordsA:
    numOfWordsA[words] += 1
    numOfWordsA[words] = numOfWordsA[words] / len(documentA.split(" "))

 
numOfWordsB = dict.fromkeys(uniqueWords, 0)
for words in bagOfWordsB:
    numOfWordsB[words] += 1

In [None]:
print(bagOfWordsA)

print(numOfWordsA)
#print(numOfWordsB)

# Total number of documents per class

In [None]:
#There are 8 classess

classFrequency = []
classLabels = ['neg','pos']
for c in classLabels:
    nTotalDocumentPerClass = 0 
    for class_label in data['class_label']:
        if c == class_label: nTotalDocumentPerClass +=1
            
            
    classFrequency.append(nTotalDocumentPerClass)
                  
print(classFrequency)

In [None]:
# Total number of documents per class per term
features = dict.fromkeys(tfidf_vect.get_feature_names(),0)

arr_features = tfidf_vect.get_feature_names()


classLabels = ['neg','pos']


df = data['document']


    
for i in range(len(arr_features)):
    for term in df.loc[i].split(" "):
        if arr_features[i] == term:
            print(arr_features[i])

#print(arr_features)
    
 
  



# BagOfWords in Reuters21578

In [None]:
#print(tfidf_vect.get_feature_names())



# MONO

In [None]:
#Get Document frequency per class
alpha = 7.0
listOfMaxOccurence = []
listOfNonOccurence = []
    


# 1. Get all unique terms

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(analyzer=clean_text)
vect.fit(data['document'])
X = vect.transform(data['document'])

In [15]:
features = vect.get_feature_names()

In [16]:
len(features)

28635

# 2. Loop in every document

In [17]:
 rowdata = data['document'].loc[0].split(" ")
len(rowdata)

659

# 3.Search uniqe terms in every word in document if found print class label

In [18]:
class_label = []
for i in range(len(features)-1):
    counter = 0
    for term in data['document'].loc[0].split(" "):
        if features[i] == term:
            print(data['class_label'])
            break


0       neg
1       neg
2       neg
3       neg
4       neg
       ... 
1395    pos
1396    pos
1397    pos
1398    pos
1399    pos
Name: class_label, Length: 1400, dtype: object
0       neg
1       neg
2       neg
3       neg
4       neg
       ... 
1395    pos
1396    pos
1397    pos
1398    pos
1399    pos
Name: class_label, Length: 1400, dtype: object
0       neg
1       neg
2       neg
3       neg
4       neg
       ... 
1395    pos
1396    pos
1397    pos
1398    pos
1399    pos
Name: class_label, Length: 1400, dtype: object
0       neg
1       neg
2       neg
3       neg
4       neg
       ... 
1395    pos
1396    pos
1397    pos
1398    pos
1399    pos
Name: class_label, Length: 1400, dtype: object
0       neg
1       neg
2       neg
3       neg
4       neg
       ... 
1395    pos
1396    pos
1397    pos
1398    pos
1399    pos
Name: class_label, Length: 1400, dtype: object
0       neg
1       neg
2       neg
3       neg
4       neg
       ... 
1395    pos
1396    pos
1397    p

0       neg
1       neg
2       neg
3       neg
4       neg
       ... 
1395    pos
1396    pos
1397    pos
1398    pos
1399    pos
Name: class_label, Length: 1400, dtype: object
0       neg
1       neg
2       neg
3       neg
4       neg
       ... 
1395    pos
1396    pos
1397    pos
1398    pos
1399    pos
Name: class_label, Length: 1400, dtype: object
0       neg
1       neg
2       neg
3       neg
4       neg
       ... 
1395    pos
1396    pos
1397    pos
1398    pos
1399    pos
Name: class_label, Length: 1400, dtype: object
0       neg
1       neg
2       neg
3       neg
4       neg
       ... 
1395    pos
1396    pos
1397    pos
1398    pos
1399    pos
Name: class_label, Length: 1400, dtype: object
0       neg
1       neg
2       neg
3       neg
4       neg
       ... 
1395    pos
1396    pos
1397    pos
1398    pos
1399    pos
Name: class_label, Length: 1400, dtype: object
0       neg
1       neg
2       neg
3       neg
4       neg
       ... 
1395    pos
1396    pos
1397    p

0       neg
1       neg
2       neg
3       neg
4       neg
       ... 
1395    pos
1396    pos
1397    pos
1398    pos
1399    pos
Name: class_label, Length: 1400, dtype: object
0       neg
1       neg
2       neg
3       neg
4       neg
       ... 
1395    pos
1396    pos
1397    pos
1398    pos
1399    pos
Name: class_label, Length: 1400, dtype: object
0       neg
1       neg
2       neg
3       neg
4       neg
       ... 
1395    pos
1396    pos
1397    pos
1398    pos
1399    pos
Name: class_label, Length: 1400, dtype: object
0       neg
1       neg
2       neg
3       neg
4       neg
       ... 
1395    pos
1396    pos
1397    pos
1398    pos
1399    pos
Name: class_label, Length: 1400, dtype: object
0       neg
1       neg
2       neg
3       neg
4       neg
       ... 
1395    pos
1396    pos
1397    pos
1398    pos
1399    pos
Name: class_label, Length: 1400, dtype: object
0       neg
1       neg
2       neg
3       neg
4       neg
       ... 
1395    pos
1396    pos
1397    p

Name: class_label, Length: 1400, dtype: object
0       neg
1       neg
2       neg
3       neg
4       neg
       ... 
1395    pos
1396    pos
1397    pos
1398    pos
1399    pos
Name: class_label, Length: 1400, dtype: object
0       neg
1       neg
2       neg
3       neg
4       neg
       ... 
1395    pos
1396    pos
1397    pos
1398    pos
1399    pos
Name: class_label, Length: 1400, dtype: object
0       neg
1       neg
2       neg
3       neg
4       neg
       ... 
1395    pos
1396    pos
1397    pos
1398    pos
1399    pos
Name: class_label, Length: 1400, dtype: object
0       neg
1       neg
2       neg
3       neg
4       neg
       ... 
1395    pos
1396    pos
1397    pos
1398    pos
1399    pos
Name: class_label, Length: 1400, dtype: object
0       neg
1       neg
2       neg
3       neg
4       neg
       ... 
1395    pos
1396    pos
1397    pos
1398    pos
1399    pos
Name: class_label, Length: 1400, dtype: object
0       neg
1       neg
2       neg
3       neg
4       ne

# DF of each terms in each CLASS LABEL from the Unique terms

In [19]:
import csv
f = open("DF_Class_Label_ALL_values1","a",newline="")
writer = csv.writer(f)


In [20]:
import numpy as np
df_values = np.zeros([7708,7], dtype="int32")
np.set_printoptions(threshold=np. inf)
df_values

array([[0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0,

In [None]:
#limit = 7215 original number of rows
limit = 7708
# 10,000 parse in 4 minutes
# 29,523 unique features using CountVetorizer 
dft = {}
for indx, feature in enumerate(features):
    has_term = data["document"].str.lower().str.contains(feature)
    res = pd.DataFrame(data[has_term])
    
    classLabel = res['class_label'].value_counts().keys().tolist()
    classDf = res['class_label'].value_counts().tolist()
      
        
    dft[feature] = classLabel + classDf
    #df_values[indx] = classDf


  
    #writer.writerow(classDf)
    if indx == limit:
        break
dft


In [None]:
np.set_printoptions(threshold=np. inf)
print(df_values)

In [None]:
len(dft.get('000'))

In [None]:
filedata = np.genfromtxt('reuters_document_frequency.txt', delimiter=",")
filedata = filedata.astype('int32') # to typecast into integer


filedata

In [None]:
data.class_label.value_counts()

In [None]:
pd.DataFrame.from_dict(dft,orient='columns').T

#x = pd.DataFrame(dft)

### Document frequency in all feature - Revise computation

In [None]:
#limit = 7215 original number of rows
limit = 8324
# 10,000 parse in 4 minutes
# 29,523 unique features using CountVetorizer 
dft = {}
for indx, feature in enumerate(features):
    has_term = data["document"].str.lower().str.contains(feature)
    res = pd.DataFrame(data[has_term])
    
    classLabel = res['class_label'].value_counts().keys().tolist()
    classDf = res['class_label'].value_counts().tolist()
      
        
    dft[feature] = classLabel + classDf
    #df_values[indx] = classDf


  
    writer.writerow(classDf)
    if indx == limit:
        break
dft