In [1]:
from glob import glob #Folders directories
import pandas as pd #Data Manipulation
import numpy as np #Linear algebra
import nltk #Natural Language Processing
from nltk.tokenize import word_tokenize #Used to tokenize words
from nltk.corpus import stopwords #Used to remove stop words
from nltk.stem import PorterStemmer ##Used to perform stemming
import re #Regulars expressions

### Class for preprocessing stage

In [2]:
class preprocessing(object):
    def __init__(self, punct):
        #we define our punctuation signs to then remove them
        self.punct = punct 
        
    def cleaning(self, string):
        #Removing punctuation signs
        string = "".join([i for i in string if i not in self.punct])
        #Most of text have literally \n or \\n and we want to delete them
        string = string.replace("\\n", " ").replace("\n", " ")
        #Most of text have \\ or // literally among the words  and we want to delete them
        string = string.replace("\\", " ").replace("//", " ")
        #Most of text have -- or -- literally among the words  and we want to delete them
        string = string.replace("-", " ").replace("--", " ")
        #Taking only letters no numbers
        string = re.sub(r'\d+', '', string)
        string = ' '.join(string.split())
        #Tokenizing and lowering text
        string = word_tokenize(string.lower())
        #Removing stopwords
        stop_words = stopwords.words('english')
        cleaned_words = [w for w in string if w not in stop_words]
        stemmer = PorterStemmer()
        #Stemming 
        stemmed_words = [stemmer.stem(w) for w in cleaned_words]
        stem_words = " ".join(stemmed_words)
        return stem_words
        
        #Putting the cleaning method into a new column called clean text
    def addtoframe(selfdev, df):
        df['clean_text'] = df.body.apply(selfdev.cleaning)
        
        #Getting tags of words text
    def get_pos(self, string):
        string = nltk.word_tokenize(string)
        pos_string = nltk.pos_tag(string)
        return pos_string
        
        #Putting the tagging method into a new column called postag_text
    def postoframe(selfdev, df):
        df['postag_text'] = df.clean_text.apply(selfdev.get_pos)

In [3]:
#With the help of glob, we can read all jsons and put them into a single dataframe
jsons = sorted(glob('*.json'))
df = pd.concat((pd.read_json(file) for file in jsons),ignore_index=True)

In [4]:
#Shape of the dataframe
df.shape

(21578, 7)

In [5]:
#Types of columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21578 entries, 0 to 21577
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   title          20841 non-null  object
 1   body           19043 non-null  object
 2   date           21578 non-null  object
 3   topics         11367 non-null  object
 4   places         18798 non-null  object
 5   id             21578 non-null  int64 
 6   organisations  881 non-null    object
dtypes: int64(1), object(6)
memory usage: 1.2+ MB


In [6]:
#Checking null values
df.isnull().sum()

title              737
body              2535
date                 0
topics           10211
places            2780
id                   0
organisations    20697
dtype: int64

In [7]:
#Saving the shape of the rows before deleting null values
before = df.shape[0]

In [8]:
#Replacing null values with unknown word
df['title'] = df['title'].replace(np.nan,'unknown')
df['places'] = df['places'].replace(np.nan,'unknown')
df['organisations'] = df['organisations'].replace(np.nan,'unknown')

In [9]:
#Dropping null values
df = df.dropna()

In [10]:
##Saving the shape of the rows after deleting null values
after = df.shape[0]

In [11]:
after

10377

In [12]:
#Percetage of data we have left from the dataset
percentage = round((after/before)*100,2)
print("The percentage left of data is: ",percentage,"%")

The percentage left of data is:  48.09 %


In [13]:
#Checking there is not null values again
df.isnull().sum()

title            0
body             0
date             0
topics           0
places           0
id               0
organisations    0
dtype: int64

In [14]:
#Putting body column as string to better mnipulation
df['body'] = df['body'].astype(str)

In [15]:
#Calling our class with our punctuation signs to remove them
cleaning = preprocessing('\â€œ:â€&;?!"#$%&\'()*+,-./:;<=>?[\\]^_`{|}~¡¬´')

In [16]:
#Calling method of the class to put in the dataframe
preprocessing = cleaning.addtoframe(df)

In [17]:
df.columns

Index(['title', 'body', 'date', 'topics', 'places', 'id', 'organisations',
       'clean_text'],
      dtype='object')

In [18]:
#Watching dataframe
df

Unnamed: 0,title,body,date,topics,places,id,organisations,clean_text
0,BAHIA COCOA REVIEW,Showers continued throughout the week in\nthe ...,1987-02-26 15:01:01.790000,[cocoa],"[el-salvador, usa, uruguay]",1,unknown,shower continu throughout week bahia cocoa zon...
4,NATIONAL AVERAGE PRICES FOR FARMER-OWNED RESERVE,The U.S. Agriculture Department\nreported the ...,1987-02-26 15:10:44.600000,"[grain, wheat, corn, barley, oat, sorghum]",[usa],5,unknown,us agricultur depart report farmerown reserv n...
5,ARGENTINE 1986/87 GRAIN/OILSEED REGISTRATIONS,Argentine grain board figures show\ncrop regis...,1987-02-26 15:14:36.410000,"[veg-oil, linseed, lin-oil, soy-oil, sun-oil, ...",[argentina],6,unknown,argentin grain board figur show crop registr g...
8,CHAMPION PRODUCTS <CH> APPROVES STOCK SPLIT,Champion Products Inc said its\nboard of direc...,1987-02-26 15:17:11.200000,[earn],[usa],9,unknown,champion product inc said board director appro...
9,COMPUTER TERMINAL SYSTEMS <CPML> COMPLETES SALE,Computer Terminal Systems Inc said\nit has com...,1987-02-26 15:18:06.670000,[acq],[usa],10,unknown,comput termin system inc said complet sale sha...
...,...,...,...,...,...,...,...,...
21570,N.Z.'S CHASE CORP MAKES OFFER FOR ENTREGROWTH,Chase Corp Ltd <CHCA.WE> said it will\nmake an...,1987-10-19 01:35:27.640000,[acq],[new-zealand],21571,unknown,chase corp ltd chcaw said make offer fullypaid...
21572,TOKYO DEALERS SEE DOLLAR POISED TO BREACH 140 YEN,Tokyo's foreign exchange market is watching\nn...,1987-10-19 00:59:58.560000,"[money-fx, dlr, yen]","[japan, west-germany, usa]",21573,unknown,tokyo foreign exchang market watch nervous see...
21573,JAPAN/INDIA CONFERENCE CUTS GULF WAR RISK CHARGES,The Japan/India-Pakistan-Gulf/Japan\nshipping ...,1987-10-19 00:34:08.940000,[ship],"[hong-kong, japan, india, pakistan, iran, iraq]",21574,unknown,japanindiapakistangulfjapan ship confer said w...
21574,SOVIET INDUSTRIAL GROWTH/TRADE SLOWER IN 1987,The Soviet Union's industrial output is\ngrowi...,1987-10-19 00:18:22.790000,[ipi],[ussr],21575,unknown,soviet union industri output grow slower pace ...


In [19]:
#Putting the postag on a column
cleaning.postoframe(df)

In [20]:
#Checking datafrme
df

Unnamed: 0,title,body,date,topics,places,id,organisations,clean_text,postag_text
0,BAHIA COCOA REVIEW,Showers continued throughout the week in\nthe ...,1987-02-26 15:01:01.790000,[cocoa],"[el-salvador, usa, uruguay]",1,unknown,shower continu throughout week bahia cocoa zon...,"[(shower, NN), (continu, NN), (throughout, IN)..."
4,NATIONAL AVERAGE PRICES FOR FARMER-OWNED RESERVE,The U.S. Agriculture Department\nreported the ...,1987-02-26 15:10:44.600000,"[grain, wheat, corn, barley, oat, sorghum]",[usa],5,unknown,us agricultur depart report farmerown reserv n...,"[(us, PRP), (agricultur, VBP), (depart, JJ), (..."
5,ARGENTINE 1986/87 GRAIN/OILSEED REGISTRATIONS,Argentine grain board figures show\ncrop regis...,1987-02-26 15:14:36.410000,"[veg-oil, linseed, lin-oil, soy-oil, sun-oil, ...",[argentina],6,unknown,argentin grain board figur show crop registr g...,"[(argentin, NN), (grain, NN), (board, NN), (fi..."
8,CHAMPION PRODUCTS <CH> APPROVES STOCK SPLIT,Champion Products Inc said its\nboard of direc...,1987-02-26 15:17:11.200000,[earn],[usa],9,unknown,champion product inc said board director appro...,"[(champion, NN), (product, NN), (inc, NN), (sa..."
9,COMPUTER TERMINAL SYSTEMS <CPML> COMPLETES SALE,Computer Terminal Systems Inc said\nit has com...,1987-02-26 15:18:06.670000,[acq],[usa],10,unknown,comput termin system inc said complet sale sha...,"[(comput, NN), (termin, NN), (system, NN), (in..."
...,...,...,...,...,...,...,...,...,...
21570,N.Z.'S CHASE CORP MAKES OFFER FOR ENTREGROWTH,Chase Corp Ltd <CHCA.WE> said it will\nmake an...,1987-10-19 01:35:27.640000,[acq],[new-zealand],21571,unknown,chase corp ltd chcaw said make offer fullypaid...,"[(chase, NN), (corp, NN), (ltd, NN), (chcaw, N..."
21572,TOKYO DEALERS SEE DOLLAR POISED TO BREACH 140 YEN,Tokyo's foreign exchange market is watching\nn...,1987-10-19 00:59:58.560000,"[money-fx, dlr, yen]","[japan, west-germany, usa]",21573,unknown,tokyo foreign exchang market watch nervous see...,"[(tokyo, JJ), (foreign, JJ), (exchang, NN), (m..."
21573,JAPAN/INDIA CONFERENCE CUTS GULF WAR RISK CHARGES,The Japan/India-Pakistan-Gulf/Japan\nshipping ...,1987-10-19 00:34:08.940000,[ship],"[hong-kong, japan, india, pakistan, iran, iraq]",21574,unknown,japanindiapakistangulfjapan ship confer said w...,"[(japanindiapakistangulfjapan, NN), (ship, NN)..."
21574,SOVIET INDUSTRIAL GROWTH/TRADE SLOWER IN 1987,The Soviet Union's industrial output is\ngrowi...,1987-10-19 00:18:22.790000,[ipi],[ussr],21575,unknown,soviet union industri output grow slower pace ...,"[(soviet, JJ), (union, NN), (industri, NN), (o..."


## Checking the topics classes for the classification

In [21]:
#There are more than three types of topics, therefore we will need a multi-classifier
df.topics

0                                                  [cocoa]
4               [grain, wheat, corn, barley, oat, sorghum]
5        [veg-oil, linseed, lin-oil, soy-oil, sun-oil, ...
8                                                   [earn]
9                                                    [acq]
                               ...                        
21570                                                [acq]
21572                                 [money-fx, dlr, yen]
21573                                               [ship]
21574                                                [ipi]
21575                                               [gold]
Name: topics, Length: 10377, dtype: object

In [22]:
#Putting topics column as string to better manipulation
df['topics'] = df['topics'].astype(str)

In [23]:
#Saving number of different classes in the topics columns
topics_number = df['topics'].value_counts()

In [24]:
len(topics_number)

646

In [25]:
#Principal six top classes 
topics_number.head(6)

['earn']        3735
['acq']         2125
['crude']        355
['trade']        333
['money-fx']     259
['interest']     211
Name: topics, dtype: int64

## Choosing the 6 most appeared topics to the classification

In [26]:
earns = df[df['topics'] == "['earn']"]
acqs = df[df['topics'] == "['acq']"]
crudes = df[df['topics'] == "['crude']"]
trades = df[df['topics'] == "['trade']"]
moneyfxs = df[df['topics'] == "['money-fx']"]
interests = df[df['topics'] == "['interest']"]

In [27]:
#Merging all those six classes into one
little = pd.concat([earns, acqs, crudes, trades, moneyfxs, interests]).reset_index(drop=True)

In [28]:
little

Unnamed: 0,title,body,date,topics,places,id,organisations,clean_text,postag_text
0,CHAMPION PRODUCTS <CH> APPROVES STOCK SPLIT,Champion Products Inc said its\nboard of direc...,1987-02-26 15:17:11.200000,['earn'],[usa],9,unknown,champion product inc said board director appro...,"[(champion, NN), (product, NN), (inc, NN), (sa..."
1,COBANCO INC <CBCO> YEAR NET,"Shr 34 cts vs 1.19 dlrs\n Net 807,000 vs 2,...",1987-02-26 15:18:59.340000,['earn'],[usa],11,unknown,shr ct vs dlr net vs asset mln vs mln deposit ...,"[(shr, NN), (ct, NN), (vs, JJ), (dlr, JJ), (ne..."
2,AM INTERNATIONAL INC <AM> 2ND QTR JAN 31,Oper shr loss two cts vs profit seven cts\n ...,1987-02-26 15:20:13.090000,['earn'],[usa],13,unknown,oper shr loss two ct vs profit seven ct oper s...,"[(oper, IN), (shr, JJ), (loss, NN), (two, CD),..."
3,BROWN-FORMAN INC <BFD> 4TH QTR NET,Shr one dlr vs 73 cts\n Net 12.6 mln vs 15....,1987-02-26 15:20:27.170000,['earn'],[usa],14,unknown,shr one dlr vs ct net mln vs mln rev mln vs ml...,"[(shr, VB), (one, CD), (dlr, NN), (vs, NN), (c..."
4,DEAN FOODS <DF> SEES STRONG 4TH QTR EARNINGS,Dean Foods Co expects earnings for the\nfourth...,1987-02-26 15:26:26.780000,['earn'],[usa],18,unknown,dean food co expect earn fourth quarter end ma...,"[(dean, NN), (food, NN), (co, NN), (expect, VB..."
...,...,...,...,...,...,...,...,...,...
7013,LAWSON SAYS UK INTEREST RATE PROSPECTS UNCHANGED,U.K. Chancellor of the Exchequer Nigel\nLawson...,1987-10-20 14:16:22.140000,['interest'],[uk],20532,unknown,uk chancellor exchequ nigel lawson said collap...,"[(uk, JJ), (chancellor, NN), (exchequ, NN), (n..."
7014,REAGAN SEES ROOM FOR INTEREST RATE DECLINES,President Reagan said he was pleased\nwith the...,1987-10-20 17:16:07.650000,['interest'],[usa],20769,unknown,presid reagan said pleas action bank reduc pri...,"[(presid, NN), (reagan, NN), (said, VBD), (ple..."
7015,FED ARRANGES THREE-DAY SYSTEM REPOS,The Federal Reserve entered the\ngovernment se...,1987-10-19 11:55:16.350000,['interest'],[usa],21285,unknown,feder reserv enter govern secur market arrang ...,"[(feder, NN), (reserv, NN), (enter, NN), (gove..."
7016,FDIC'S SEIDMAN SAYS HIGHER RATES COULD HARM BANKS,Federal Deposit Insurance Corp Chairman\nWilli...,1987-10-19 11:09:59.740000,['interest'],[usa],21342,unknown,feder deposit insur corp chairman william seid...,"[(feder, NN), (deposit, NN), (insur, VBP), (co..."


In [29]:
#Removing the brackets to better manipulation
little['topics_clean'] = little['topics'].str.strip('[]')

In [30]:
little

Unnamed: 0,title,body,date,topics,places,id,organisations,clean_text,postag_text,topics_clean
0,CHAMPION PRODUCTS <CH> APPROVES STOCK SPLIT,Champion Products Inc said its\nboard of direc...,1987-02-26 15:17:11.200000,['earn'],[usa],9,unknown,champion product inc said board director appro...,"[(champion, NN), (product, NN), (inc, NN), (sa...",'earn'
1,COBANCO INC <CBCO> YEAR NET,"Shr 34 cts vs 1.19 dlrs\n Net 807,000 vs 2,...",1987-02-26 15:18:59.340000,['earn'],[usa],11,unknown,shr ct vs dlr net vs asset mln vs mln deposit ...,"[(shr, NN), (ct, NN), (vs, JJ), (dlr, JJ), (ne...",'earn'
2,AM INTERNATIONAL INC <AM> 2ND QTR JAN 31,Oper shr loss two cts vs profit seven cts\n ...,1987-02-26 15:20:13.090000,['earn'],[usa],13,unknown,oper shr loss two ct vs profit seven ct oper s...,"[(oper, IN), (shr, JJ), (loss, NN), (two, CD),...",'earn'
3,BROWN-FORMAN INC <BFD> 4TH QTR NET,Shr one dlr vs 73 cts\n Net 12.6 mln vs 15....,1987-02-26 15:20:27.170000,['earn'],[usa],14,unknown,shr one dlr vs ct net mln vs mln rev mln vs ml...,"[(shr, VB), (one, CD), (dlr, NN), (vs, NN), (c...",'earn'
4,DEAN FOODS <DF> SEES STRONG 4TH QTR EARNINGS,Dean Foods Co expects earnings for the\nfourth...,1987-02-26 15:26:26.780000,['earn'],[usa],18,unknown,dean food co expect earn fourth quarter end ma...,"[(dean, NN), (food, NN), (co, NN), (expect, VB...",'earn'
...,...,...,...,...,...,...,...,...,...,...
7013,LAWSON SAYS UK INTEREST RATE PROSPECTS UNCHANGED,U.K. Chancellor of the Exchequer Nigel\nLawson...,1987-10-20 14:16:22.140000,['interest'],[uk],20532,unknown,uk chancellor exchequ nigel lawson said collap...,"[(uk, JJ), (chancellor, NN), (exchequ, NN), (n...",'interest'
7014,REAGAN SEES ROOM FOR INTEREST RATE DECLINES,President Reagan said he was pleased\nwith the...,1987-10-20 17:16:07.650000,['interest'],[usa],20769,unknown,presid reagan said pleas action bank reduc pri...,"[(presid, NN), (reagan, NN), (said, VBD), (ple...",'interest'
7015,FED ARRANGES THREE-DAY SYSTEM REPOS,The Federal Reserve entered the\ngovernment se...,1987-10-19 11:55:16.350000,['interest'],[usa],21285,unknown,feder reserv enter govern secur market arrang ...,"[(feder, NN), (reserv, NN), (enter, NN), (gove...",'interest'
7016,FDIC'S SEIDMAN SAYS HIGHER RATES COULD HARM BANKS,Federal Deposit Insurance Corp Chairman\nWilli...,1987-10-19 11:09:59.740000,['interest'],[usa],21342,unknown,feder deposit insur corp chairman william seid...,"[(feder, NN), (deposit, NN), (insur, VBP), (co...",'interest'


In [31]:
type(little['topics_clean'])

pandas.core.series.Series

In [32]:
little['topics_clean']

0           'earn'
1           'earn'
2           'earn'
3           'earn'
4           'earn'
           ...    
7013    'interest'
7014    'interest'
7015    'interest'
7016    'interest'
7017    'interest'
Name: topics_clean, Length: 7018, dtype: object

In [33]:
# Function to remove quotation marks
def remove_comillas(string):
    comillas = "''"
    string = "".join([i for i in string if i not in comillas])
    return string

In [34]:
#Removing quotation marks
little['topics_clean'] = little['topics_clean'].apply(lambda x: remove_comillas(x))

In [35]:
little

Unnamed: 0,title,body,date,topics,places,id,organisations,clean_text,postag_text,topics_clean
0,CHAMPION PRODUCTS <CH> APPROVES STOCK SPLIT,Champion Products Inc said its\nboard of direc...,1987-02-26 15:17:11.200000,['earn'],[usa],9,unknown,champion product inc said board director appro...,"[(champion, NN), (product, NN), (inc, NN), (sa...",earn
1,COBANCO INC <CBCO> YEAR NET,"Shr 34 cts vs 1.19 dlrs\n Net 807,000 vs 2,...",1987-02-26 15:18:59.340000,['earn'],[usa],11,unknown,shr ct vs dlr net vs asset mln vs mln deposit ...,"[(shr, NN), (ct, NN), (vs, JJ), (dlr, JJ), (ne...",earn
2,AM INTERNATIONAL INC <AM> 2ND QTR JAN 31,Oper shr loss two cts vs profit seven cts\n ...,1987-02-26 15:20:13.090000,['earn'],[usa],13,unknown,oper shr loss two ct vs profit seven ct oper s...,"[(oper, IN), (shr, JJ), (loss, NN), (two, CD),...",earn
3,BROWN-FORMAN INC <BFD> 4TH QTR NET,Shr one dlr vs 73 cts\n Net 12.6 mln vs 15....,1987-02-26 15:20:27.170000,['earn'],[usa],14,unknown,shr one dlr vs ct net mln vs mln rev mln vs ml...,"[(shr, VB), (one, CD), (dlr, NN), (vs, NN), (c...",earn
4,DEAN FOODS <DF> SEES STRONG 4TH QTR EARNINGS,Dean Foods Co expects earnings for the\nfourth...,1987-02-26 15:26:26.780000,['earn'],[usa],18,unknown,dean food co expect earn fourth quarter end ma...,"[(dean, NN), (food, NN), (co, NN), (expect, VB...",earn
...,...,...,...,...,...,...,...,...,...,...
7013,LAWSON SAYS UK INTEREST RATE PROSPECTS UNCHANGED,U.K. Chancellor of the Exchequer Nigel\nLawson...,1987-10-20 14:16:22.140000,['interest'],[uk],20532,unknown,uk chancellor exchequ nigel lawson said collap...,"[(uk, JJ), (chancellor, NN), (exchequ, NN), (n...",interest
7014,REAGAN SEES ROOM FOR INTEREST RATE DECLINES,President Reagan said he was pleased\nwith the...,1987-10-20 17:16:07.650000,['interest'],[usa],20769,unknown,presid reagan said pleas action bank reduc pri...,"[(presid, NN), (reagan, NN), (said, VBD), (ple...",interest
7015,FED ARRANGES THREE-DAY SYSTEM REPOS,The Federal Reserve entered the\ngovernment se...,1987-10-19 11:55:16.350000,['interest'],[usa],21285,unknown,feder reserv enter govern secur market arrang ...,"[(feder, NN), (reserv, NN), (enter, NN), (gove...",interest
7016,FDIC'S SEIDMAN SAYS HIGHER RATES COULD HARM BANKS,Federal Deposit Insurance Corp Chairman\nWilli...,1987-10-19 11:09:59.740000,['interest'],[usa],21342,unknown,feder deposit insur corp chairman william seid...,"[(feder, NN), (deposit, NN), (insur, VBP), (co...",interest


In [36]:
#Checking how much data we have left from the original dataset
percentage1 = round((little.shape[0]/before)*100,2)
print("The percentage left of data is: ",percentage1,"%")

The percentage left of data is:  32.52 %


# SEGUIR CHECANDO EL TFIDF, TAGGS OCCURENCE Y CHECAR LA RED NEURONAL

In [37]:
#Choosing the column in which we might work
nice = little[['body','clean_text', 'postag_text', 'topics_clean']]

In [38]:
nice

Unnamed: 0,body,clean_text,postag_text,topics_clean
0,Champion Products Inc said its\nboard of direc...,champion product inc said board director appro...,"[(champion, NN), (product, NN), (inc, NN), (sa...",earn
1,"Shr 34 cts vs 1.19 dlrs\n Net 807,000 vs 2,...",shr ct vs dlr net vs asset mln vs mln deposit ...,"[(shr, NN), (ct, NN), (vs, JJ), (dlr, JJ), (ne...",earn
2,Oper shr loss two cts vs profit seven cts\n ...,oper shr loss two ct vs profit seven ct oper s...,"[(oper, IN), (shr, JJ), (loss, NN), (two, CD),...",earn
3,Shr one dlr vs 73 cts\n Net 12.6 mln vs 15....,shr one dlr vs ct net mln vs mln rev mln vs ml...,"[(shr, VB), (one, CD), (dlr, NN), (vs, NN), (c...",earn
4,Dean Foods Co expects earnings for the\nfourth...,dean food co expect earn fourth quarter end ma...,"[(dean, NN), (food, NN), (co, NN), (expect, VB...",earn
...,...,...,...,...
7013,U.K. Chancellor of the Exchequer Nigel\nLawson...,uk chancellor exchequ nigel lawson said collap...,"[(uk, JJ), (chancellor, NN), (exchequ, NN), (n...",interest
7014,President Reagan said he was pleased\nwith the...,presid reagan said pleas action bank reduc pri...,"[(presid, NN), (reagan, NN), (said, VBD), (ple...",interest
7015,The Federal Reserve entered the\ngovernment se...,feder reserv enter govern secur market arrang ...,"[(feder, NN), (reserv, NN), (enter, NN), (gove...",interest
7016,Federal Deposit Insurance Corp Chairman\nWilli...,feder deposit insur corp chairman william seid...,"[(feder, NN), (deposit, NN), (insur, VBP), (co...",interest


In [39]:
#Used to encode the topics classes into numbers
from sklearn import preprocessing

In [40]:
#Calling the method LabelEncoder
le = preprocessing.LabelEncoder()

In [41]:
#Encoding the topics classes into numbers
nice['topics_clean'] = le.fit_transform(nice.topics_clean.values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [42]:
nice

Unnamed: 0,body,clean_text,postag_text,topics_clean
0,Champion Products Inc said its\nboard of direc...,champion product inc said board director appro...,"[(champion, NN), (product, NN), (inc, NN), (sa...",2
1,"Shr 34 cts vs 1.19 dlrs\n Net 807,000 vs 2,...",shr ct vs dlr net vs asset mln vs mln deposit ...,"[(shr, NN), (ct, NN), (vs, JJ), (dlr, JJ), (ne...",2
2,Oper shr loss two cts vs profit seven cts\n ...,oper shr loss two ct vs profit seven ct oper s...,"[(oper, IN), (shr, JJ), (loss, NN), (two, CD),...",2
3,Shr one dlr vs 73 cts\n Net 12.6 mln vs 15....,shr one dlr vs ct net mln vs mln rev mln vs ml...,"[(shr, VB), (one, CD), (dlr, NN), (vs, NN), (c...",2
4,Dean Foods Co expects earnings for the\nfourth...,dean food co expect earn fourth quarter end ma...,"[(dean, NN), (food, NN), (co, NN), (expect, VB...",2
...,...,...,...,...
7013,U.K. Chancellor of the Exchequer Nigel\nLawson...,uk chancellor exchequ nigel lawson said collap...,"[(uk, JJ), (chancellor, NN), (exchequ, NN), (n...",3
7014,President Reagan said he was pleased\nwith the...,presid reagan said pleas action bank reduc pri...,"[(presid, NN), (reagan, NN), (said, VBD), (ple...",3
7015,The Federal Reserve entered the\ngovernment se...,feder reserv enter govern secur market arrang ...,"[(feder, NN), (reserv, NN), (enter, NN), (gove...",3
7016,Federal Deposit Insurance Corp Chairman\nWilli...,feder deposit insur corp chairman william seid...,"[(feder, NN), (deposit, NN), (insur, VBP), (co...",3


In [43]:
#How many element we have per each class
nice.topics_clean.value_counts()

2    3735
0    2125
1     355
5     333
4     259
3     211
Name: topics_clean, dtype: int64

### POS TAGGING

In [44]:
class pos_tagging(object):
    
    #Getting lexico
    def get_lexic(self, tokens, lexic):
        for word in tokens:
            if word in lexic.keys():
                lexic[word] += 1
            else:
                lexic[word] = 1
        return lexic
    
    #Given on the frecuency of each item in lexico, it gets its probability
    def get_lexic_prob(self,lexic):
        lexic_prob = lexic
        lexic_lenght = sum(lexic_prob.values())
        for word in lexic:
            lexic_prob[word] = lexic_prob[word]/lexic_lenght
        return lexic_prob
    
    #Grabbing tags
    def grab_pos_tag(self, string):
        pos = []
        for i in string:
            pos_token = nltk.pos_tag(i.split())
            pos.append(pos_token[0][1])
        return pos
    
    #Bigrams and probabilities of the chains method
    def get_lex_bi(self, pos_tag):
        lexic = {}
        total = 0
        for bigram in list(bigrams(pos_tag)):
            if bigram in lexic.keys():
                    lexic[bigram] += 1
            else:
                lexic[bigram] = 1
            total +=1 
        for i in lexic.keys():
            lexic[i] = lexic[i]/total
            
        lexic_sor = sorted(lexic.items(), key=operator.itemgetter(1), reverse=True)
        return lexic_sor

In [45]:
lexical = pos_tagging()
lexic = {}
pos_tag_list = []

In [46]:
for i in df["clean_text"]:
    clean_list =list(i)
    tokenizer = word_tokenize(("".join(clean_list)))
    lexic.update(lexical.get_lexic(tokenizer, lexic))
    lexic_prob = lexical.grab_pos_tag(tokenizer)
    pos_tag_list += lexic_prob

KeyboardInterrupt: 

In [None]:
#Making copy of lexic
wordsbag = lexic.copy()
#Get probability of word appeared on lexic
lexic_prob = lexical.get_lexic_prob(lexic) 
bigram_prob = lexical.get_lexic(pos_tag_list)

In [None]:
class get_tfidf:
    def TF(item, lexic):
        lexic_len = len(lexic)
        tf = {}
        for word, count in item.items():
            tf[word] = count / float(lexic_len)
        return tf

    def IDF(item):
        len_item = len(item)
        IDF = dict.fromkeys(item[0].keys(), 0)
        for document in item:
            for i, j in document.items():
                if j > 0:
                    IDF[i] += 1
        for i, j in IDF.items():
            IDF[i] = math.log(len_item / float(j))
        return IDF

    def TFIDF(item, idf):
        tf_idf = {}
        for i, j in item.items():
            tf_idf[i] = j * idf[i]
        return tf_idf

In [None]:
lexic

In [None]:
BagOfWords = set(lexic)
posdic = dict.fromkeys(BagOfWords, 0) 
for i in lexic:
    posdic[i] += 1

In [None]:
tfpos = get_tfidf.TF(posdic, BagOfWords)
tfidfpos = get_tfidf.TFIDF(tfpos, tfpos)

## MAY BE NOT

In [None]:
"""#Choosing the column for doing the pos tagging tasks
string = nice['clean_text']"""

In [None]:
"""class post_tag(object):

    def __init__(self, pos_tag_list=[], lexico_dict={}):
        self.pos_tag_list = pos_tag_list
        self.lexico_dict = lexico_dict

    #Calling methods
    def work(self, string):
        #pos_tag = []
        #lexico = {}
        for i in string:
            mylist = list(i)
            mylist = word_tokenize(("".join(i)))
            self.lexico_dict.update(self.lexicon(mylist,self.lexico_dict))
            bigram_prob = self.pos_tag
            lexic_prob = self.prob_lexico
            temp_pos=self.pos_tag(mylist)
            self.pos_tag_list+=temp_pos
        return self, bigram_prob, lexic_prob

    #Getting lexico
    def lexicon(self, mylist, lexico_dict):
        for word in mylist:
            if word in lexico_dict.keys():
                lexico_dict[word] += 1
            else:
                lexico_dict[word] = 1
        return lexico_dict

    #Given on the frecuency of each item in lexico, it gets its probability
    def prob_lexico(self,lexico_dict):
        lex_prob = lexico_dict
        len_lex = sum(lex_prob.values())
        for word in lexico_dict:
            lex_prob[word] = lex_prob[word]/len_lex
        return lex_prob

    #Grabbing tags
    def pos_tag(self,string):
        POS,Token=[],[]
        for word in string:
            pos_token = nltk.pos_tag(word.split())
            POS.append(pos_token[0][1])
        return POS

    #Getting bigrams and probabilities
    def bigrams_prob(self, pos_tag):
        lexic = {}
        total=0
        for bigram in list(bigrams(POS_tag)):
            if bigram in lexic.keys():
                    lexic[bigram] += 1
            else:
                lexic[bigram] = 1
            total+=1
        for i in lexic.keys():
            lexic[i]= lexic[i]/total
            
        sorted_lexic = sorted(lexic.items(), key=operator.itemgetter(1), reverse=True)
        return sorted_lexic"""

In [None]:
#postaging = post_tag()

In [None]:
#mylist = []
#lexico_dict = {}

In [None]:
"""mylist = []
lexico_dict = {}
for i in nice['clean_text']:
    mylist = list(i)
    tokenizer = word_tokenize(("".join(mylist)))
    lexico_dict.update(postaging.lexicon(mylist,lexico_dict))
    temp_pos=postaging.pos_tag(tokenizer)
    mylist += temp_pos"""

In [None]:
#lexico_dict

In [None]:
#temp_pos

In [None]:
"""a = postaging.lexicon(mylist,lexico_dict)
a"""

In [None]:
#check, uno, dos = postaging.work(string)

In [None]:
#uno

In [None]:
#import pickle

In [None]:
#pickle.dump(df, open('unit1.pickle','wb'))

In [None]:
#clean = list(df['clean_text'])

In [None]:
"""all_tags = []
tags = nltk.pos_tag(clean)
tags.insert(0,(" ",'-s-'))
all_tags.append(tags)"""

In [None]:
#all_tags

In [None]:
"""def get_pos(string):
    string = nltk.word_tokenize(string)
    pos_string = nltk.pos_tag(string)
    return pos_string"""

In [None]:
#clean = list(df['clean_text'])

In [None]:
#get_pos(clean)

In [None]:
#get_pos(clean)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(df.clean_text.tolist())
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
tfidf = pd.DataFrame(denselist, columns=feature_names)

In [None]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(tfidf, df.topics_clean)

In [None]:
clf.score(tfidf, df.topics_clean)