In [None]:
import tweepy
from autenticacao import CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import tree, naive_bayes
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

In [None]:
# Fetch the NLTK resources used by the preprocessing cells below:
# 'rslp' (rules for the RSLP stemmer) and 'punkt' (tokenizer models).
nltk.download(info_or_id='rslp')
nltk.download(info_or_id='punkt')

In [None]:
# Authenticate against Twitter with the credentials imported from the
# local `autenticacao` module, then build the API client used below.
auth = tweepy.OAuthHandler(
    consumer_key=CONSUMER_KEY,
    consumer_secret=CONSUMER_SECRET,
)
auth.set_access_token(key=ACCESS_TOKEN, secret=ACCESS_TOKEN_SECRET)
api = tweepy.API(auth)


In [None]:
# Print the text of the first 10 tweets returned for the home timeline.
cursor_timeline = tweepy.Cursor(api.home_timeline)
for status in cursor_timeline.items(10):
    print(status.text)


In [None]:
# Print the text of the first 10 tweets returned for the "twitter" user.
cursor_usuario = tweepy.Cursor(api.user_timeline, id="twitter")
for status in cursor_usuario.items(10):
    print(status.text)

In [None]:
# Search by term: print the text of the first 10 results for 'eleições'.
cursor_busca = tweepy.Cursor(api.search, q='eleições')
for status in cursor_busca.items(10):
    print(status.text)

In [None]:
# Monitor tweets in real time.
class MyStreamListener(tweepy.StreamListener):
    """Stream listener that prints each tweet and appends it to ``tweets.txt``."""

    def on_status(self, status):
        """Print the text of ``status`` and append it as one line of ``tweets.txt``."""
        print(status.text)
        # Append mode, so text written by earlier calls is kept.
        with open('tweets.txt', 'a', encoding='utf8') as arq:
            arq.write(status.text)
            arq.write('\n')

    def on_error(self, status_code):
        """Print the error code reported by the stream.

        Returns ``False`` when ``status_code`` is 420 (rate limited) so the
        stream disconnects instead of reconnecting; any other code returns
        ``None``, as before.
        """
        print('Erro: ' + repr(status_code))
        if status_code == 420:
            # Reconnecting while rate limited only lengthens the wait that
            # Twitter imposes, so tell tweepy to stop the stream.
            return False
        
# Start the stream and collect tweets until it stops or is interrupted.
myStreamListener = MyStreamListener(api=tweepy.API(wait_on_rate_limit=True))
myStream = tweepy.Stream(auth=api.auth, listener=myStreamListener)
try:
    # track - terms to monitor
    myStream.filter(track=['eleições'])
except KeyboardInterrupt:
    # Interrupting the kernel is the normal way to end the collection.
    pass
except Exception as erro:
    # Keep the notebook running (as the original best-effort code did), but
    # report the failure instead of hiding it.
    print('Erro: ' + repr(erro))
finally:
    # Make sure the stream is flagged as stopped however the cell ended.
    myStream.disconnect()

In [None]:
# Load the training set from 'Tweets_Mg.csv' and preview its first rows.
treinamento = pd.read_csv(filepath_or_buffer='Tweets_Mg.csv')

treinamento.head(n=5)

In [None]:
# Class distribution: number of training rows per 'Classificacao' value.
treinamento['Classificacao'].value_counts()


In [None]:
# Linguistic preprocessing.
# Read the stop-word file (one entry per line) into a list. Each line is
# stripped and blank lines are skipped, so the list holds no empty string
# (e.g. from a trailing newline) and no whitespace-padded entries.
with open('stopwords.txt', 'r', encoding='utf-8') as s:
    stopwords = [linha.strip() for linha in s if linha.strip()]
    
def stemmer(doc):
    """Tokenize ``doc`` and return its RSLP-stemmed tokens joined by spaces.

    Args:
        doc: text to process (a string accepted by ``nltk.word_tokenize``).

    Returns:
        A single string with the stem of every token, separated by ' '.
    """
    # Build the stemmer once and cache it on the function object; this
    # function is used as the vectorizer's preprocessor and is called once
    # per document.
    rslp = getattr(stemmer, '_rslp', None)
    if rslp is None:
        rslp = stemmer._rslp = nltk.stem.RSLPStemmer()
    # The texts are Portuguese, so use the Portuguese Punkt model rather
    # than the default English one.
    tokens = nltk.word_tokenize(doc, language='portuguese')
    return ' '.join(rslp.stem(token) for token in tokens)


In [None]:
# Build the TF-IDF representation of the training tweets.
tfidf = TfidfVectorizer(
    binary=False,
    norm='l1',
    use_idf=True,
    # Tokens are runs of 3+ letters. `[^\W\d_]` matches any Unicode letter,
    # so accented Portuguese words are kept; the previous ASCII-only class
    # `[a-zA-Z]` discarded every word containing an accented character.
    token_pattern=r'\b[^\W\d_]{3,}\b',
    stop_words=stopwords,
    # NOTE(review): the stop words are not stemmed while the documents are,
    # so some stop words may no longer match - confirm whether intended.
    preprocessor=stemmer,
)
vetor = tfidf.fit_transform(treinamento['Text'])
categorias = treinamento['Classificacao']

In [None]:
# Train a classifier and measure its accuracy on a held-out split.
classif = naive_bayes.GaussianNB()
# Hold out 30% of the rows for evaluation. A fixed random_state makes the
# split, and therefore the reported accuracy, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    vetor, categorias, test_size=0.3, random_state=42
)

# The sparse matrices are converted to dense arrays before being passed to
# GaussianNB.
classif.fit(x_train.toarray(), y_train)

y_predict = classif.predict(x_test.toarray())
acuracia = accuracy_score(y_test, y_predict)
acuracia

In [None]:
# Classify new tweets.
# Read the collected tweets (one JSON object per line).
teste = pd.read_json('tweets_pt.json', lines=True, encoding='utf8')

# Apply the already-fitted TF-IDF vectorizer to the 'text' field.
vetor_test = tfidf.transform(teste['text'])
# Use a dedicated name for the predictions so the held-out labels stored in
# `y_test` by the training cell are not overwritten.
sentimentos_previstos = classif.predict(vetor_test.toarray())

# Build a dataframe pairing each tweet text with its predicted sentiment.
result = pd.DataFrame({'texto': teste['text'], 'sentimento': sentimentos_previstos})
result.head()

In [None]:
# Class distribution: number of rows per predicted 'sentimento' value.
result['sentimento'].value_counts()

In [None]:
# Show the rows whose predicted sentiment is 'Negativo'.
eh_negativo = result["sentimento"] == 'Negativo'
result.loc[eh_negativo]