In [29]:
# Author: Olivier Grisel <olivier.grisel@ensta.org>
# License: Simplified BSD

import sys

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn import metrics

import pandas as pd
import numpy as np
import os


path = "datasets/710527/"
df = pd.read_csv(path+"dataset_pop-by-likes.csv")
df = df.rename(columns={"text": "X", "pop_level": "y"})

df['X'] = df['X']+" &"+df['likes'].astype(str)
display(df)

y_col = 'y'
X_cols = df.columns
X_cols = X_cols.drop(y_col)

X_cols = X_cols.drop('likes')
X_cols = X_cols.drop('followers')
X_cols = X_cols.drop('impressions')
X_cols = X_cols.drop('popularity') ####

print(X_cols)

# split data
docs_train, docs_test, y_train, y_test = train_test_split(
    df.loc[:,X_cols], df.loc[:,y_col], test_size=0.5)

docs_train = docs_train['X']
docs_test = docs_test['X']

target_names = ['class 0', 'class 1', 'class 2']
#display(target_names.len)

# TASK: Build a vectorizer that splits strings into sequence of 1 to 3
# characters instead of word tokens
vectorizer = TfidfVectorizer(ngram_range=(1, 3), analyzer='char',
                             use_idf=False)

# TASK: Build a vectorizer / classifier pipeline using the previous analyzer
# the pipeline instance should stored in a variable named clf
clf = Pipeline([
    ('vec', vectorizer),
    ('clf', Perceptron()),
])

# TASK: Fit the pipeline on the training set
clf.fit(docs_train, y_train)

# TASK: Predict the outcome on the testing set in a variable named y_predicted
y_predicted = clf.predict(docs_test)

# Print the classification report
print(metrics.classification_report(y_test, y_predicted,
                                    target_names=target_names))

# Plot the confusion matrix
cm = metrics.confusion_matrix(y_test, y_predicted)
print(cm)

#import matlotlib.pyplot as plt
#plt.matshow(cm, cmap=plt.cm.jet)
#plt.show()

# Predict the result on some short new sentences:
sentences = [
    'This is a language detection test.',
    'Ceci est un test de d\xe9tection de la langue.',
    'Dies ist ein Test, um die Sprache zu erkennen.',
]
predicted = clf.predict(sentences)

for s, p in zip(sentences, predicted):
    print('The language of "%s" is "%s"' % (s, target_names[p]))

Unnamed: 0,X,likes,impressions,followers,popularity,y
0,"Reduce your cravings, or better yet, replace w...",0.0,58.0,1873.0,0.278652,0
1,3 hours in the lunch tent now going home 🤪 &0.0,0.0,70.0,252.0,0.431823,0
2,I could really use a glass of melted ice cream...,0.0,7.0,93.0,0.494803,0
3,Gonna sustain a healthy diet over my days off ...,1.0,46.0,43.0,1.091426,1
4,i need the trans of what chanyeol said in the ...,0.0,66.0,753.0,0.355813,0
...,...,...,...,...,...,...
6065,Vanilla hulk from smoothie king never disappoi...,1.0,125.0,243.0,0.868310,1
6066,Finally going to eat my instant ramen.....I wa...,1.0,30.0,79.0,1.010513,1
6067,Been up since 7 am everyone else still sleep…....,0.0,85.0,371.0,0.406460,0
6068,NOOOOOO I meant to buy a six pack of diet soda...,1.0,42.0,131.0,0.946309,1


Index(['X'], dtype='object')
              precision    recall  f1-score   support

     class 0       0.93      0.93      0.93      1342
     class 1       0.74      0.87      0.80      1128
     class 2       0.70      0.47      0.56       565

    accuracy                           0.82      3035
   macro avg       0.79      0.75      0.76      3035
weighted avg       0.82      0.82      0.81      3035

[[1243   90    9]
 [  43  979  106]
 [  50  250  265]]
The language of "This is a language detection test." is "class 0"
The language of "Ceci est un test de détection de la langue." is "class 2"
The language of "Dies ist ein Test, um die Sprache zu erkennen." is "class 1"
