# Scikit-Learn

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Text feature extraction

In [3]:
df = pd.read_csv('df_copom_label.csv')

In [None]:
df.head(5)

Unnamed: 0,Date,Selic,Meeting_Number,Decision,Decision_txt,label_hawk_dove,label_next_meet,Text,Type
0,2006/03/08,16.5,117.0,-0.75,decrease,dovish,decrease,"In the March Meeting, the Banco Central do Br...",Statement
1,2006/04/19,15.75,118.0,-0.75,decrease,dovish,decrease,"In the April Meeting, the Monetary Policy Com...",Statement
2,2006/05/31,15.25,119.0,-0.5,decrease,dovish,decrease,"In the May Meeting, the Monetary Policy Commi...",Statement
3,2006/07/19,14.75,120.0,-0.5,decrease,dovish,decrease,"In the July Meeting, the Copom unanimously de...",Statement
4,2006/08/30,14.25,121.0,-0.5,decrease,dovish,decrease,"In the August Meeting, the Copom unanimously ...",Statement


In [None]:
df.isnull().sum()

Date               0
Selic              0
Meeting_Number     0
Decision           0
Decision_txt       0
label_hawk_dove    0
label_next_meet    0
Text               0
Type               0
dtype: int64

In [8]:
df['Decision'].value_counts()

 0.00    59
 0.50    21
-0.50    19
-0.25    12
 0.75    10
-0.75     9
 1.00     8
 0.25     7
-1.00     7
 1.50     6
-1.50     1
Name: Decision, dtype: int64

In [9]:
df['label_hawk_dove'].unique()

array(['dovish', 'hawkish', 'neutral'], dtype=object)

In [10]:
df['label_hawk_dove'].value_counts()

hawkish    58
dovish     54
neutral    47
Name: label_hawk_dove, dtype: int64

## Split the data

In [13]:
from sklearn.model_selection import train_test_split

  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'


In [14]:
X = df['Text']
y = df['label_hawk_dove']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=123)

## Build a Pipeline

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC()),
])

text_clf.fit(X_train, y_train)  

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype=np.int):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, positive=False):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).e

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [17]:
predictions = text_clf.predict(X_test)

In [18]:
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))

[[14  1  2]
 [ 3 15  2]
 [ 0  2  9]]


In [19]:
print(metrics.classification_report(y_test,predictions))

              precision    recall  f1-score   support

      dovish       0.82      0.82      0.82        17
     hawkish       0.83      0.75      0.79        20
     neutral       0.69      0.82      0.75        11

   micro avg       0.79      0.79      0.79        48
   macro avg       0.78      0.80      0.79        48
weighted avg       0.80      0.79      0.79        48



In [20]:
print(metrics.accuracy_score(y_test,predictions))

0.7916666666666666


## Scikit-learn's CountVectorizer

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
count_vect = CountVectorizer()

In [14]:
X_train_counts = count_vect.fit_transform(X_train)

In [15]:
X_train_counts

<111x2433 sparse matrix of type '<class 'numpy.int64'>'
	with 27992 stored elements in Compressed Sparse Row format>

## Transform Counts to Frequencies with Tf-idf

In [21]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()

X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(111, 2433)

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

X_train_tfidf = vectorizer.fit_transform(X_train)
X_train_tfidf.shape

(111, 2433)

## Train a Classifier

In [23]:
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(X_train_tfidf,y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)