# Introductory steps: loding all relevant packages, loading the dataset and cleaning it

First, I install and load all of the relevant packages. 

In [1]:
!pip install embeddingvectorizer
import re
import pickle
import numpy as np
import pandas as pd
import regex
import re
import joblib
import nltk
import gensim
from gensim.models import KeyedVectors
word_vectors = KeyedVectors.load("W2V.kv", mmap='r+')
vocabs = word_vectors.index_to_key
vectors = word_vectors.vectors
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from embeddingvectorizer import EmbeddingCountVectorizer, EmbeddingTfidfVectorizer
import embeddingvectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn import metrics
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
nltk.download("stopwords")
nltk.download("punkt")



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\franc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\franc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Then, I load the document and I have a quick look at it

In [2]:
data = pd.read_csv("labeled.csv")
data.head()

Unnamed: 0,title,blurb,topic
0,Ora parla l’economista francese e avverte Ue: ...,"La Commissione europea, con l’Italia, sta gioc...",Economy
1,FB chiede alle banche i dati dei clienti - Int...,Facebook ha chiesto alle maggiori banche ameri...,Other
2,"Poste Italiane: partono le Domande, in tutta I...",Affinché i Portalettere in servizio possano an...,Other
3,"Aspirina pericolosa per i malatti di cuore, ri...",Il ministero della Salute Britannico ha invita...,Politics
4,"Il San Carlo a Milano, Mehta: «Spero ci siano ...",L’orchestra e il coro del Massimo in concerto ...,Politics


First, I check whether the text contains E-mail addresses, HTML tags and HTML Character escapes and I remove them. I do not drop the numbers and the stopwords as that will be done automatically with my tokenizer. Then, I check that all of the characters were actually removed.

In [3]:
data.topic.str.contains(r"https?://[\w\.]+\b|www\.[\w\.]+\b").sum()

0

In [4]:
data.topic.str.contains(r"</?\w[^>]*>").sum()

0

In [5]:
data.topic.str.contains(r"&[^;]+;").sum()

0

In [6]:
data.title.str.contains(r"https?://[\w\.]+\b|www\.[\w\.]+\b").sum()

9

In [7]:
data.title.str.contains(r"</?\w[^>]*>").sum()

0

In [8]:
data.title.str.contains(r"&[^;]+;").sum()

2

In [9]:
data.blurb.str.contains(r"https?://[\w\.]+\b|www\.[\w\.]+\b").sum()

9

In [10]:
data.blurb.str.contains(r"</?\w[^>]*>").sum()

2

In [11]:
data.blurb.str.contains(r"&[^;]+;").sum()

94

In [12]:
data['title'] = data['title'].str.replace(r"&[^;]+;", '', regex=True)
data['title'] = data['title'].str.replace(r"https?://[\w\.]+\b|www\.[\w\.]+\b", '', regex=True)
data['blurb'] = data['blurb'].str.replace(r"&[^;]+;", '', regex=True)
data['blurb'] = data['blurb'].str.replace(r"</?\w[^>]*>", '', regex=True)
data['blurb'] = data['blurb'].str.replace(r"https?://[\w\.]+\b|www\.[\w\.]+\b", '', regex=True)

In [13]:
data.title.str.contains(r"https?://[\w\.]+\b|www\.[\w\.]+\b").sum()

0

In [14]:
data.title.str.contains(r"&[^;]+;").sum()

0

In [15]:
data.blurb.str.contains(r"https?://[\w\.]+\b|www\.[\w\.]+\b").sum()

0

In [16]:
data.blurb.str.contains(r"</?\w[^>]*>").sum()

0

In [17]:
data.blurb.str.contains(r"&[^;]+;").sum()

0

Now I check whether there are NAs. I see that there are 11. Therefore, I check where they are contained and I remove them. Then, I check that everything went well. 

In [18]:
data.isnull().sum().sum()

11

In [19]:
data['title'].isnull().sum()

0

In [20]:
data['blurb'].isnull().sum()

11

In [21]:
data['topic'].isnull().sum()

0

In [22]:
data = data.dropna(subset=['blurb'])

In [23]:
data.isnull().sum().sum()

0

Now, I join the columns blurb and text together. I do so, so that the classifier will be trained only on one column. Instead of keeping only one of them, I combine them together so that the classifier will have more text to be trained on. Then, I remove the columns blurb and title as they are not important anymore. I also check what values the column topic containes and then I change all of the values that are different from politics to other, as I want the classifier to distinguish only political news from all other types of news (therefore, the types of the other news are not relevant anymore). I then check that everything went well. 

In [24]:
data['text'] = data['title'] + ' ' + data['blurb']
data = data.drop(columns=["title", "blurb"])
data["topic"].unique()

array(['Economy', 'Other', 'Politics', 'CrimeDisaster', 'Entertainment',
       'Culture', 'ScienceTech', 'Sports'], dtype=object)

In [25]:
data['topic'] = data['topic'].replace(['Economy', 'CrimeDisaster', 'Entertainment', 'Culture', 'ScienceTech', 'Sports'], 'Other')
data["topic"].unique()

array(['Other', 'Politics'], dtype=object)

In [26]:
data.head()

Unnamed: 0,topic,text
0,Other,Ora parla l’economista francese e avverte Ue: ...
1,Other,FB chiede alle banche i dati dei clienti - Int...
2,Other,"Poste Italiane: partono le Domande, in tutta I..."
3,Politics,"Aspirina pericolosa per i malatti di cuore, ri..."
4,Politics,"Il San Carlo a Milano, Mehta: «Spero ci siano ..."


Then, I create my tokenizer. 

# Creating the tokenizer and splitting the datset

I create my tokenizer and choose Italian as a language as the texts are in Italian

In [27]:
class MyTokenizer:
    def tokenize(self, text):
        result = []
        word =  r"\p{letter}"
        tokens = nltk.word_tokenize(text, language = "Italian")
        tokens = [t for t in tokens if regex.search(word, t)]
        result += tokens
        return result
    
mytokenizer = MyTokenizer()

Now, I divide the data into two subsets: a train a and a test subset. The test subset is set aside to test the classifier after it has been traied on the train subset (this will be done later). I check how the tokenizer perform with 2 sentences taken from the data. It performs quite well: punctuation and numebrs are removed, articles and words are identified well. 

In [28]:
X_train, X_test, y_train, y_test = train_test_split(data["text"], data["topic"], test_size=0.2, random_state=5)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

sentence = "La Commissione europea, con lâ€™Italia, sta giocando &qout;un gioco pericoloso&qout;. A dirlo Ã¨ Steve Ohana, economista francese della Escp School. &qout;Non reagire alla manovra italiana avrebbe significato una grossa perdita di credibilitÃ  per la Commissione e, piÃ¹ in generale, per le istituzio..."
sentence2 = "L&#39;incidente nel pomeriggio a Caramanico Terme, in localit&agrave; San Tommaso (Pescara). Le vittime, una coppia, erano in gita con i figli di 5 e 8 anni&nbsp; e alcuni amici. Buio e zona impervia rendono difficile il recupero delle salme"
sentence3 = "Brno, doppietta Ducati: Dovizioso piega Lorenzo. 3° Marquez, poi Rossi Splendida vittoria del forlivese che regge agli assalti dei due spagnoli e conquista il secondo successo stagionale dopo un intenso duello. Vale chiude ai piedi del podio"
print(mytokenizer.tokenize(sentence2))
print(mytokenizer.tokenize(sentence))
print(mytokenizer.tokenize(sentence3))

(2056,) (2056,)
(514,) (514,)
['L', 'incidente', 'nel', 'pomeriggio', 'a', 'Caramanico', 'Terme', 'in', 'localit', 'agrave', 'San', 'Tommaso', 'Pescara', 'Le', 'vittime', 'una', 'coppia', 'erano', 'in', 'gita', 'con', 'i', 'figli', 'di', 'e', 'anni', 'nbsp', 'e', 'alcuni', 'amici', 'Buio', 'e', 'zona', 'impervia', 'rendono', 'difficile', 'il', 'recupero', 'delle', 'salme']
['La', 'Commissione', 'europea', 'con', 'lâ€™Italia', 'sta', 'giocando', 'qout', 'un', 'gioco', 'pericoloso', 'qout', 'A', 'dirlo', 'Ã¨', 'Steve', 'Ohana', 'economista', 'francese', 'della', 'Escp', 'School', 'qout', 'Non', 'reagire', 'alla', 'manovra', 'italiana', 'avrebbe', 'significato', 'una', 'grossa', 'perdita', 'di', 'credibilitÃ', 'per', 'la', 'Commissione', 'e', 'piÃ¹', 'in', 'generale', 'per', 'le', 'istituzio']
['Brno', 'doppietta', 'Ducati', 'Dovizioso', 'piega', 'Lorenzo', 'Marquez', 'poi', 'Rossi', 'Splendida', 'vittoria', 'del', 'forlivese', 'che', 'regge', 'agli', 'assalti', 'dei', 'due', 'spagnoli', 

Then, I didive the train split that I just created again. I will use this new train/validation split to analyse the various vectorizers and classifiers. 


In [29]:
X_train_new, X_val, y_train_new, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=5)
print(X_train_new.shape, y_train_new.shape)
print(X_val.shape, y_val.shape)

(1644,) (1644,)
(412,) (412,)


# Testing the models with BOW approach and beyond BOW approach

Now, I check how various models work.

In [30]:
models = [
    ("NB-count", CountVectorizer(tokenizer=mytokenizer.tokenize, min_df=5, max_df=.5), MultinomialNB()),
    ("NB-TfIdf", TfidfVectorizer(min_df=5, max_df=.5), MultinomialNB()),
    (
        "LR-Count", CountVectorizer(tokenizer=mytokenizer.tokenize, min_df=5, max_df=.5),
        LogisticRegression(solver="liblinear"),
    ),
    (
        "LR-TfIdf", TfidfVectorizer(tokenizer=mytokenizer.tokenize, min_df=5, max_df=.5),
        LogisticRegression(solver="liblinear"),
    ),
    (
        "SVC-count", CountVectorizer(tokenizer=mytokenizer.tokenize, min_df=5, max_df=.5),
        SVC(gamma="scale"),
    ),
    (   "SVC-TfIdf", TfidfVectorizer(tokenizer=mytokenizer.tokenize, min_df=5, max_df=.5),
        SVC(gamma="scale"),
    ),
    (
        "RF-Count", CountVectorizer(tokenizer=mytokenizer.tokenize, min_df=5, max_df=.5),
        RandomForestClassifier(n_estimators=100),
    ),
    (   "RF-TfIdf", TfidfVectorizer(tokenizer=mytokenizer.tokenize, min_df=5, max_df=.5),
        RandomForestClassifier(n_estimators=100),
    ),
]

for name, vectorizer, classifier in models:
    print(name)
    X_train2 = vectorizer.fit_transform(X_train_new)
    X_test2 = vectorizer.transform(X_val)
    classifier.fit(X_train2, y_train_new)
    y_pred = classifier.predict(X_test2)
    print(metrics.accuracy_score(y_val, y_pred))
    print("\n")

NB-count




0.7766990291262136


NB-TfIdf
0.7742718446601942


LR-Count




0.7597087378640777


LR-TfIdf




0.7669902912621359


SVC-count




0.7475728155339806


SVC-TfIdf




0.7694174757281553


RF-Count




0.7597087378640777


RF-TfIdf




0.779126213592233




I see that none of the values are above .8, which is what we should strive for. Therefore, I use a non-BOW approach to see if I can achieve better accuracy scores. I decided to use word embeddings because I think that they are quite useful in analysing Italian texts and news healdlines dealing with politics. Word embeddings are a powerful tool for natural language processing, as they allow you to represent words in a high-dimensional space, where words with similar meanings are located close to each other. This moves beyong the BOW approaches that takes into account simply the frequency of words. With word embeddings the classifier can be trained to find synonyms and analogies. Hence, this can help improve the accuracy of machine learning models by providing them with more informative features. I especially think that they are useful to analyse Italian, as it is a language rich of synonyms, analogies and comparisons. Differently form english, which is a quite straightforward and 'simplistic' language, Italian is a more nuanced and rich language. First, I create a dataframe with the words and their asisgned values and then I create a dict which I will then use to test various combiantions of vectorizers and classifiers with word embeddings. I try the two most common vectorizers (Count and Tfidf) and the main models that we studied so far. Because Naive Bayes cannot handle negative numbers, I do not include it in the mdoels tested. First, I check that the word embeddings work well. Moreover, I include random forests and not decision trees as, overall, the former perform better than the latter ones. 

In [31]:
wvdf = pd.DataFrame(word_vectors.vectors, index=word_vectors.index_to_key)
word_vectors2_dict = dict(zip(vocabs, vectors))
wvdf.sort_values(0, ascending=False).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
amphibiaweb.org,2.383386,1.243215,0.782704,-0.691165,-0.080835,-0.638561,0.840422,-0.20075,-0.346297,-0.17731,...,0.180366,-0.216721,-0.936201,0.1303,-0.004838,0.21158,0.984754,1.653805,0.114008,-0.441808
andegavorum,2.299073,-0.154413,0.084963,-1.31742,-0.928468,0.823194,0.533603,1.324872,0.344809,0.539048,...,1.051035,0.42123,-0.626271,0.624682,0.08076,0.849758,0.217712,0.712347,0.353163,0.887824
reiske,2.295041,0.204441,-0.02276,-0.315606,-0.029753,0.225191,0.215381,0.882625,0.238328,0.765809,...,-0.30581,-0.304357,0.177891,0.143306,-0.13965,0.563671,-0.424972,-0.311885,-0.303003,0.35319
virginem,2.273339,0.7943,-0.029058,-0.793297,-0.021337,-0.441115,-0.172477,0.717569,-0.402575,-0.233796,...,-0.220241,0.241052,0.109762,0.646236,0.351735,-0.240822,-0.089408,0.772591,-0.059612,-0.36722
bruys,2.262714,0.225114,0.016751,-0.411804,-0.056492,0.211376,0.192823,0.057007,0.814759,-0.872253,...,-0.257632,-0.106817,0.672275,-0.705625,1.161071,0.008391,0.24617,0.354919,0.059887,0.349734


In [32]:
modelsWE = [
    (
        "LR-Count",
        embeddingvectorizer.EmbeddingCountVectorizer(word_vectors2_dict, operator='sum'),
        LogisticRegression(solver="liblinear"),
    ),
    (
        "LR-TfIdf",
        embeddingvectorizer.EmbeddingTfidfVectorizer(word_vectors2_dict, operator='sum'),
        LogisticRegression(solver="liblinear"),
    ),
    (
        "SVC-count",
        embeddingvectorizer.EmbeddingCountVectorizer(word_vectors2_dict, operator='sum'),
        SVC(gamma="scale"),
    ),
    (   "SVC-TfIdf",
        embeddingvectorizer.EmbeddingTfidfVectorizer(word_vectors2_dict, operator='sum'),
        SVC(gamma="scale"),
    ),
    (
        "RF-Count",
        embeddingvectorizer.EmbeddingCountVectorizer(word_vectors2_dict, operator='sum'),
        RandomForestClassifier(n_estimators=100),
    ),
    (   "RF-TfIdf",
        embeddingvectorizer.EmbeddingTfidfVectorizer(word_vectors2_dict, operator='sum'),
        RandomForestClassifier(n_estimators=100),
    ),
]

for name, vectorizer, classifier in modelsWE:
    print(name)
    X_train3 = np.array(list(vectorizer.fit_transform(X_train_new)))
    X_test3 = np.array(list(vectorizer.transform(X_val)))
    classifier.fit(X_train3, y_train_new)
    y_pred = classifier.predict(X_test3)
    print(metrics.accuracy_score(y_val, y_pred))
    print("\n")

LR-Count
0.7669902912621359


LR-TfIdf
0.8131067961165048


SVC-count
0.8155339805825242


SVC-TfIdf
0.8179611650485437


RF-Count
0.7936893203883495


RF-TfIdf
0.7766990291262136




# Tuning the models and choosing the final model

As expected, word embedding improve most the models' performance quite much. I see that the SVC-TfIdf and the SVC-count models are the one performing the best (they have the highest accuracy score) and, therefore, I will tune these one model. I will only look at parameters C as my computer could not handle any research with more parameters (such as kernel and grade) and the same occurred with collab. As value C controls the tradeoff between the complexity of the decision boundary and the amount of classification error that is permitted, testing different values of C is important because it allows us to find the best tradeoff between overfitting and underfitting the data, and thus, helps to obtain better generalization performance on unseen data. Then, I choose my best model and provide a final classification report generated on the test data set aside at the beginning. 
Unfortunately my laptop crashed multiple times when running the search grid (I think because it does not have enough memory). I was able to run it once, but not again. Therefore, I copy-pasted here the code that I used and the output that I got. 

pipeline = Pipeline(
    steps=[
        ("vectorizer", embeddingvectorizer.EmbeddingCountVectorizer(word_vectors2_dict, operator='mean')),
        ("classifier", SVC(gamma="scale")),
    ]
)
grid = {
    "classifier__C": [0.01, 1, 100],
}
search = GridSearchCV(
    estimator=pipeline, n_jobs=-1, param_grid=grid, scoring="accuracy", cv=5
)
search.fit(X_train, y_train)

print(f"Best parameters: {search.best_params_}")
print(f"Best accuracy: {search.best_score_}")

Best parameters: {'classifier__C': 1}
0.8131067961165048


pipeline = Pipeline(
    steps=[
        ("vectorizer", embeddingvectorizer.EmbeddingTfidfVectorizer(word_vectors2_dict, operator='mean')),
        ("classifier", SVC(gamma="scale")),
    ]
)
grid = {
    "classifier__C": [0.01, 1, 100],
}
search = GridSearchCV(
    estimator=pipeline, n_jobs=-1, param_grid=grid, scoring="accuracy", cv=5
)
search.fit(X_train, y_train)

print(f"Best parameters: {search.best_params_}")
print(f"Best accuracy: {search.best_score_}")

Best parameters: {'classifier__C': 1}
Best accuracy: 0.8180922684430586


As the SVC model witht he count vectorizer turns out to be slight more accurate, I choose this as my final model to test the test data that I set out at the beginning. Now, I don't look only at the accuracy score, but I also generate a classification report. This gives a better overview as it shows the precision, recall and F1 values for each column. 

In [33]:
final_pipeline = Pipeline(
    steps=[
        ("vectorizer", embeddingvectorizer.EmbeddingCountVectorizer(word_vectors2_dict, operator='mean')),
        ("classifier", SVC(gamma="scale", C=1)),
    ]
)

X_train10 = np.array(list(vectorizer.fit_transform(X_train)))
X_test10 = np.array(list(vectorizer.transform(X_test)))
classifier.fit(X_train10, y_train)
y_pred = classifier.predict(X_test10)
print(metrics.accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))
print("\n")

0.7898832684824902
              precision    recall  f1-score   support

       Other       0.77      0.92      0.84       303
    Politics       0.84      0.61      0.70       211

    accuracy                           0.79       514
   macro avg       0.80      0.76      0.77       514
weighted avg       0.80      0.79      0.78       514





# Loading the unlabeled dataset and cleaning it

Now, I load the unlabeled data and clean it as I did for the labeled data.

In [34]:
unlabeled = pd.read_csv("unlabeled.csv")
unlabeled.head()

Unnamed: 0,title,blurb
0,"Nuoto, Dall?Aglio morto in palestra","Nuoto la tragedia Nuoto, Dall’Aglio morto in p..."
1,"È ufficiale, Valentino Rossi rinnova con la Ya...","""Il pesarese ha firmato con la Casa di Iwata u..."
2,Real Madrid-Juve: minacce a moglie dell'arbitr...,"""Gli insulti sul profilo Twitter della consort..."
3,"""Salvini su Balotelli: """"Capitano della Nazion...","""Il ministro dell'Interno e vicepremier: """"Spe..."
4,"""Juve, Tardelli e la Coppa vinta all'Heysel: ""...","""L'ex bianconero, in campo nel 1985 contro il ..."


In [35]:
unlabeled.title.str.contains(r"https?://[\w\.]+\b|www\.[\w\.]+\b").sum()

50

In [36]:
unlabeled.blurb.str.contains(r"https?://[\w\.]+\b|www\.[\w\.]+\b").sum()

68

In [37]:
unlabeled.title.str.contains(r"</?\w[^>]*>").sum()

0

In [38]:
unlabeled.title.str.contains(r"&[^;]+;").sum()

1

In [39]:
unlabeled.blurb.str.contains(r"</?\w[^>]*>").sum()

6

In [40]:
unlabeled.blurb.str.contains(r"&[^;]+;").sum()

923

In [41]:
unlabeled['title'] = unlabeled['title'].str.replace(r"&[^;]+;", '', regex=True)
unlabeled['title'] = unlabeled['title'].str.replace(r"https?://[\w\.]+\b|www\.[\w\.]+\b", '', regex=True)
unlabeled['blurb'] = unlabeled['blurb'].str.replace(r"&[^;]+;", '', regex=True)
unlabeled['blurb'] = unlabeled['blurb'].str.replace(r"</?\w[^>]*>", '', regex=True)
unlabeled['blurb'] = unlabeled['blurb'].str.replace(r"https?://[\w\.]+\b|www\.[\w\.]+\b", '', regex=True)

In [42]:
unlabeled.blurb.str.contains(r"&[^;]+;").sum()

0

In [43]:
unlabeled.blurb.str.contains(r"</?\w[^>]*>").sum()

0

In [44]:
unlabeled.title.str.contains(r"&[^;]+;").sum()

0

In [45]:
unlabeled.blurb.str.contains(r"https?://[\w\.]+\b|www\.[\w\.]+\b").sum()

0

In [46]:
unlabeled.title.str.contains(r"https?://[\w\.]+\b|www\.[\w\.]+\b").sum()

0

In [47]:
unlabeled.isnull().sum().sum()

144

In [48]:
unlabeled['title'].isnull().sum()

35

In [49]:
unlabeled = unlabeled.dropna(subset=['blurb'])

In [50]:
unlabeled = unlabeled.dropna(subset=['title'])

In [51]:
unlabeled.isnull().sum().sum()

0

In [52]:
unlabeled['text'] = unlabeled['title'] + ' ' + unlabeled['blurb']
new_unlabeled = unlabeled.drop(columns=["title", "blurb"])
new_unlabeled.head()

Unnamed: 0,text
0,"Nuoto, Dall?Aglio morto in palestra Nuoto la t..."
1,"È ufficiale, Valentino Rossi rinnova con la Ya..."
2,Real Madrid-Juve: minacce a moglie dell'arbitr...
3,"""Salvini su Balotelli: """"Capitano della Nazion..."
4,"""Juve, Tardelli e la Coppa vinta all'Heysel: ""..."


labels = {1: "political", 0: "Other"}
data["topic"] = data["topic"].replace(labels)
predictions = final_pipeline.predict(unlabeled["text"])
unlabeled["predictions"] = predictions

# Fitting the models to the unlabeled data and using it to predict the topics of the articles

Now, I create the labels for the topic of the unlabeled dataset, I then use the model I chose as the best one to predict the topic for the unlabeled datatset.

In [53]:
final_pipeline.fit(data["text"], data["topic"])
labels = {1: "political", 0: "Other"}
data["topic"] = data["topic"].replace(labels)

predictions = final_pipeline.predict(new_unlabeled["text"])

new_unlabeled["predictions"] = predictions

In [54]:
new_unlabeled.head()

Unnamed: 0,text,predictions
0,"Nuoto, Dall?Aglio morto in palestra Nuoto la t...",Other
1,"È ufficiale, Valentino Rossi rinnova con la Ya...",Other
2,Real Madrid-Juve: minacce a moglie dell'arbitr...,Other
3,"""Salvini su Balotelli: """"Capitano della Nazion...",Politics
4,"""Juve, Tardelli e la Coppa vinta all'Heysel: ""...",Other


# Finally, saving the new dataset with the texts and the predicted topics

First, I convert the NumPy ndarray object into a dataframe and then I save it a csv file. 

In [55]:
df_predictions = pd.DataFrame(new_unlabeled)
df_predictions.to_csv("predicted_unlabeleddata.csv", index=False)