In [None]:
!pip install transformers

In [None]:
!pip install snowballstemmer

In [None]:
# Mount Google Drive at /content/drive; the dataset is read from a path under
# that mount point in a later cell.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import warnings
warnings.filterwarnings("ignore")
import transformers
import torch

import copy
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from torch import nn, optim
from torch.utils import data



import re, string, unicodedata
import nltk
import inflect
from nltk import word_tokenize, sent_tokenize

%matplotlib inline
%config Inline.Backend.figure_formats='retina'

sns.set(style='whitegrid', palette = 'muted', font_scale=1.2)

rcParams['figure.figsize'] = 12,8

RANDOM_SEED = 42 
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

In [None]:
#df = pd.read_csv('full_data.csv',error_bad_lines = False,encoding='utf-8',engine='python')
# Load the dataset from the mounted Drive folder and preview the first rows.
# NOTE(review): the path is hard-coded to one user's Drive layout.
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/full_data.csv')
df.head()

In [None]:
df.shape

In [None]:
# Call the method: a bare `df.info` only displays the bound-method repr
# instead of the column / dtype / null-count summary.
df.info()

In [None]:
df = df.drop_duplicates()

In [None]:
df.shape

In [None]:
df = df.drop(columns=['timestamp','unnamed_0'])

In [None]:
# Milli Gazete articles can appear several times under the same title; keep
# only the longest version of each one.
#
# The previous groupby(...).text.agg('max') picked the lexicographically
# greatest text (string max), not the longest, and discarded every column
# except title/text. Sorting by length and dropping duplicate titles keeps the
# longest text together with the rest of its row.
is_milligazete = df['brand'] == 'Milli Gazete'
df_milligazete = (
    df[is_milligazete]
    .assign(length=lambda frame: frame['text'].str.len())
    .sort_values('length', ascending=False)
    .drop_duplicates(subset='title', keep='first')
    .drop(columns='length')
)

# Swap the raw Milli Gazete rows for the de-duplicated ones. ignore_index
# gives the combined frame a clean, unique RangeIndex.
df_list = [df[~is_milligazete], df_milligazete]
df = pd.concat(df_list, ignore_index=True)

In [None]:
df.shape

In [None]:
# Read the stop-word spreadsheet and keep its 'word' column as a plain list.
stop_words = pd.read_excel('turkish_stop.xlsx')['word'].tolist()

In [None]:
df = df.fillna("")

In [None]:
docs = np.array(df['text'])


In [None]:
import itertools 
def grouper(n, iterable):
    """Yield consecutive tuples of up to ``n`` items taken from ``iterable``.

    The last tuple may hold fewer than ``n`` items. Iteration ends as soon as
    a slice comes back empty.
    """
    source = iter(iterable)
    # iter(callable, sentinel) keeps calling until the empty tuple is returned.
    yield from iter(lambda: tuple(itertools.islice(source, n)), ())

In [None]:
import string
def translate(doc):
  """Replace Turkish-specific letters with their closest ASCII letters.

  Args:
      doc: Iterable of strings.

  Returns:
      A list with one transliterated string per input item, in input order.
  """
  # The table now includes the capital "Ş" -> "S" pair. It was missing, so a
  # capital Ş survived transliteration and was later stripped as a non-ASCII
  # character, splitting the word it belonged to.
  tr2eng = str.maketrans("çğıöşüÇĞİÖŞÜ", "cgiosuCGIOSU")
  return [item.translate(tr2eng) for item in doc]
    
    

In [None]:
group = grouper(100,docs)

In [None]:
#def clean(doc):
#    punc_free = []
#    stop_free = " ".join([i for i in str(doc).lower().split() if i not in stop_words])
#    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
#    digit_free = re.sub("\d+", " ", punc_free)
    
#    return digit_free
import nltk
import numpy as np
WPT = nltk.WordPunctTokenizer()

# Documents are ASCII-folded by translate() before they reach norm_doc, and
# norm_doc itself strips every non-ASCII character. Stop words must therefore
# be folded the same way, otherwise entries containing Turkish letters can
# never match a token. A set also makes each lookup O(1).
# Trade-off: folding makes some stop words collide with ordinary words
# (e.g. "şu" becomes "su"); that ambiguity is inherent to the ASCII folding.
_TR_TO_ASCII = str.maketrans("çğıöşüÇĞİÖŞÜ", "cgiosuCGIOSU")
ASCII_STOP_WORDS = {
    word.translate(_TR_TO_ASCII).lower()
    for word in stop_words
    if isinstance(word, str)  # skip empty spreadsheet cells (NaN)
}


def norm_doc(single_doc):
    """Normalise one ASCII-folded document for bag-of-words style analysis.

    Steps: replace every character that is not an ASCII letter or a space with
    a space, lowercase, tokenize, drop stop words, and re-join with single
    spaces.

    Args:
        single_doc: Document text (str).

    Returns:
        The cleaned document as a single space-separated string.
    """
    # Remove special characters and numbers. This already removes ",.;", so
    # the separate punctuation substitution that used to follow was a no-op.
    single_doc = re.sub(r"([^a-zA-Z ]+?)", " ", single_doc)
    single_doc = single_doc.lower().strip()
    tokens = WPT.tokenize(single_doc)
    filtered_tokens = [token for token in tokens if token not in ASCII_STOP_WORDS]
    return ' '.join(filtered_tokens)

# Element-wise wrapper so a whole batch of documents is cleaned in one call.
norm_docs = np.vectorize(norm_doc)

# Build the batches here instead of consuming the `group` generator created in
# an earlier cell. A generator is exhausted after one pass, so re-running this
# cell used to produce an empty result without any error. Iterating with a
# comprehension also avoids a try/except that would have swallowed a
# StopIteration raised inside the cleaning step.
normalized_documents = [
    norm_docs(translate(batch)) for batch in grouper(100, docs)
]
  

In [None]:
# One entry per cleaned document: the list of its comma-separated pieces.
data = [item.split(",") for doc in normalized_documents for item in doc]

In [None]:
len(data)

In [None]:
# Preview only the first few entries; displaying the whole list floods the
# cell output and bloats the saved notebook.
data[:5]

In [None]:
# Flatten the per-document piece lists into a single list of strings.
new_row = [piece for pieces in data for piece in pieces]

In [None]:
df["cleaned_text"] = new_row

In [None]:
df.head()

In [None]:
df.to_csv('cleaned_data.csv')

In [None]:
import locale
from datetime import datetime

def convert_datetime(value):
    """Parse a date string written in any of the known source formats.

    Args:
        value: Raw date text. Non-string values (e.g. values that were already
            converted by a previous run of the cell) are returned unchanged.

    Returns:
        A ``datetime.date`` for the first format that matches, or ``""`` when
        the string matches none of them.
    """
    if not isinstance(value, str):
        # strptime only accepts str; passing anything else raised TypeError,
        # which made the date-conversion cell fail when re-run.
        return value

    # NOTE(review): '%b' matches month abbreviations of the active locale, so
    # the first format presumably needs a Turkish locale to match the data.
    formats = ['%d %b %Y - %H:%M-', '%H:%M %d.%m.%Y', '%Y-%m-%d', '%d.%m.%Y - %H:%M','/%Y/%m/%d','%d-%m-%Y']
    for dt_format in formats:
        try:
            # Stop at the first format that parses instead of trying the rest.
            return datetime.strptime(value, dt_format).date()
        except ValueError:  # raised when the format doesn't match
            continue
    return ""



In [None]:
# Blank out missing values, parse the raw date text, then expose the parsed
# dates both as the 'date' column and as the frame's 'datetime' index.
df = df.fillna("")
df.loc[:, "date"] = df["date"].apply(convert_datetime)
parsed_dates = pd.to_datetime(df["date"])
df["datetime"] = parsed_dates
df["date"] = parsed_dates
df = df.set_index("datetime")

In [None]:
# One sub-frame per newspaper brand.
brand_frames = {
    brand: df[df['brand'] == brand]
    for brand in ('Milli Gazete', 'Sabah', 'Sputnik', 'Hürriyet')
}
df_milligazete = brand_frames['Milli Gazete']
df_sabah = brand_frames['Sabah']
df_sputnik = brand_frames['Sputnik']
df_hurriyet = brand_frames['Hürriyet']

In [None]:
df.head()

In [None]:
# Flatten every cleaned document into one list of tokens.
new = df['cleaned_text'].str.split().values.tolist()
corpus = [word for tokens in new for word in tokens]

# Count how often each stop word still occurs in the cleaned corpus.
from collections import defaultdict
# Membership against the stop-word *list* costs O(len(stop_words)) per token;
# a set makes the pass over the whole corpus linear.
stop_word_lookup = set(stop_words)
dic = defaultdict(int)
for word in corpus:
    if word in stop_word_lookup:
        dic[word] += 1

In [None]:
# Preview only the first tokens; displaying the whole corpus floods the cell
# output and bloats the saved notebook.
corpus[:20]

In [None]:
from collections import Counter
counter = Counter(corpus)
most = counter.most_common()

# Of the 40 most frequent tokens, keep the ones that are not stop words.
top_words = [(word, count) for word, count in most[:40] if word not in stop_words]
x = [word for word, _ in top_words]
y = [count for _, count in top_words]

# Horizontal bars: counts on the x axis, words on the y axis.
sns.barplot(x=y, y=x)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
def get_top_ngram(corpus, n=None, top_k=10):
    """Return the most frequent word n-grams in ``corpus``.

    Args:
        corpus: Iterable of document strings.
        n: Size of the n-grams to count. ``None`` is treated as 1 (single
            words); it previously produced an invalid ``ngram_range``.
        top_k: How many n-grams to return (default 10, the former fixed value).

    Returns:
        A list of ``(ngram, count)`` tuples sorted by descending count.
    """
    if n is None:
        n = 1
    vec = CountVectorizer(ngram_range=(n, n))
    # fit_transform learns the vocabulary and counts in a single pass; the
    # earlier separate .fit() call vectorized the whole corpus twice.
    bag_of_words = vec.fit_transform(corpus)
    # Column sums = total occurrences of each vocabulary entry.
    counts = np.asarray(bag_of_words.sum(axis=0)).ravel()
    words_freq = [(word, counts[idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:top_k]

In [None]:
# Ten most frequent bigrams, split into label and count columns for plotting.
top_n_bigrams = get_top_ngram(df['cleaned_text'], 2)[:10]
x, y = (list(column) for column in zip(*top_n_bigrams))
sns.barplot(x=y, y=x)


In [None]:
# Most frequent trigrams, split into label and count columns for plotting.
top_tri_grams = get_top_ngram(df['cleaned_text'], n=3)
x, y = (list(column) for column in zip(*top_tri_grams))
sns.barplot(x=y, y=x)

In [None]:
# Concatenate the per-batch arrays of cleaned documents into one flat list.
flat_list = [item for sublist in normalized_documents for item in sublist]


In [None]:
len(flat_list)

In [None]:
from gensim.corpora import Dictionary
import gensim
# Bag-of-words counts for the cleaned documents.
# min_df / max_df are set to the 0.0 / 1.0 extremes, presumably so that no
# term is filtered out by document frequency (confirm against scikit-learn).
BoW_Vector = CountVectorizer(min_df = 0., max_df = 1.)
BoW_Matrix = BoW_Vector.fit_transform(flat_list)
# Prints the sparse document-term matrix that fit_transform returned.
print(BoW_Matrix)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
number_of_topics = 4
# How many of the highest-weighted words to print per topic. This used to
# reuse number_of_topics, which tied the length of each word list to the
# number of topics by accident.
words_per_topic = 10
BoW_Matrix = BoW_Vector.fit_transform(flat_list)
LDA = LatentDirichletAllocation(n_components = number_of_topics,
                                max_iter = 10,
                                learning_offset = 50.,
                                random_state = 0,
                                learning_method = 'online').fit(BoW_Matrix)
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on older releases that do not have it yet.
if hasattr(BoW_Vector, "get_feature_names_out"):
    features = BoW_Vector.get_feature_names_out()
else:
    features = BoW_Vector.get_feature_names()
for t_id, topic in enumerate(LDA.components_):
    print ("Topic %d:" % (t_id))
    # Indices of the largest topic weights, highest first.
    top_word_ids = topic.argsort()[::-1][:words_per_topic]
    print (" ".join(features[i] for i in top_word_ids))

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

SENTIMENT_MODEL = "savasy/bert-base-turkish-sentiment-cased"

# `truncated=True` is not a documented tokenizer option, so the previous call
# never enabled truncation. Truncation is a tokenization-time setting; handing
# it to the pipeline makes every call truncate over-long inputs.
tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL)

model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL)
sa = pipeline("sentiment-analysis", tokenizer=tokenizer, model=model,
              truncation=True, max_length=512)

In [None]:
def sentiment_score(doc):
    
    #try:
    
    if len(doc) > 512:
        #print(doc)
        
        n = 512
        chunks = [doc[i:i+n] for i in range(0, len(doc), n)]
        scores = []
        for chunk in chunks:
            
            
            
            p = sa(chunk)
            #print(p)
            if p[0]:
                if p[0]['label'] == 'negative':
                    score =  - p[0]['score']
                    scores.append(score)
                else:
                    score = p[0]['score']
                    scores.append(score)
        #print(np.mean(scores))
        return np.mean(scores)
    else:
        p = sa(doc)
        score = 0
        if p[0]['label'] == 'negative':
            score =  - p[0]['score']
        else:
            score = p[0]['score']
          
        return score
    #except:
     #   return 0


In [None]:
# Blank out missing values in each brand's own frame. The previous version
# assigned df_sabah.fillna("") to all four names, silently replacing the
# Hürriyet, Milli Gazete and Sputnik data with Sabah's.
df_sabah = df_sabah.fillna("")
df_hurriyet = df_hurriyet.fillna("")
df_milligazete = df_milligazete.fillna("")
df_sputnik = df_sputnik.fillna("")
df_sabah.loc[:,"sentiment_score"] = df_sabah.cleaned_text.apply(sentiment_score)
print('finished')


In [None]:
# Score every Hürriyet article; the function is passed directly to apply().
df_hurriyet.loc[:, "sentiment_score"] = df_hurriyet["cleaned_text"].apply(sentiment_score)
print('finished')


In [None]:
# Take an explicit copy of the first 100 rows: the next cell adds a column to
# this frame, and writing to a bare slice of df_milligazete is ambiguous
# (pandas' SettingWithCopy situation).
df_milligazete2 = df_milligazete[:100].copy()

In [None]:
# Score the 100-row Milli Gazete sample.
df_milligazete2.loc[:, "sentiment_score"] = df_milligazete2["cleaned_text"].apply(sentiment_score)
print('finished')

In [None]:
df_milligazete2.head()

In [None]:
# Score every Milli Gazete article.
df_milligazete.loc[:, "sentiment_score"] = df_milligazete["cleaned_text"].apply(sentiment_score)
print('finished')

In [None]:
df_milligazete.to_csv("milli_gazete_sent_scores.csv")

In [None]:
# Score every Sputnik article.
df_sputnik.loc[:, "sentiment_score"] = df_sputnik["cleaned_text"].apply(sentiment_score)
print('finished')

In [None]:
df_milligazete.head()

In [None]:
# pd.concat is a function: the frames go in a list passed as its argument.
# The previous `pd.concat[...]` subscripted the function and raised TypeError.
df = pd.concat([df_sabah, df_milligazete, df_hurriyet, df_sputnik])
df.to_csv('sentiment_scores.csv')
#df.head()

In [None]:
df['sentiment_score'].hist()

In [None]:
def sentiment(x):
    """Map a signed score to a label: 'neg' below zero, 'neu' at zero, else 'pos'."""
    if x < 0:
        return 'neg'
    return 'neu' if x == 0 else 'pos'
    
# Label each article from its score, then chart how many fall in each class.
df['sentiment'] = df['sentiment_score'].map(lambda x: sentiment(x))

sentiment_counts = df.sentiment.value_counts()
plt.bar(sentiment_counts.index, sentiment_counts)

In [None]:
df[df['sentiment']=='pos']['cleaned_text'].head()

In [None]:
df[df['sentiment']=='neg']['cleaned_text'].head()

In [None]:
# Line plot of the Hürriyet sentiment scores against the frame's index.
plt.plot(df_hurriyet.index, df_hurriyet['sentiment_score'])
plt.title('Sentiment Scores Over Time')
plt.ylabel('Sentiment Scores');
plt.show()

In [None]:
# Median sentiment score per "M" resampling bin across all brands.
# NOTE(review): resample() needs a datetime-like index, and newer pandas
# releases prefer the "ME" alias over "M"; confirm against the installed version.
df[["sentiment_score"]].resample("M").median().plot(figsize=(15,4))

In [None]:
# Hürriyet articles dated after 2015-06-01 and up to and including 2015-10-01.
after_start = df_hurriyet['date'] > '2015-06-01 '
until_end = df_hurriyet['date'] <= '2015-10-01 '
df_hurriyet_q1 = df_hurriyet[after_start & until_end]

In [None]:
# Index the window by its 'date' column and convert that index with
# pd.to_datetime; a later cell resamples this frame by time.
df_hurriyet_q1 = df_hurriyet_q1.set_index('date')
df_hurriyet_q1.index = pd.to_datetime(df_hurriyet_q1.index)

In [None]:
df_hurriyet.plot(y=["sentiment_score"], figsize=(15,4))

In [None]:

# Monthly mean sentiment, with each month-start label moved 14 days forward.
# The `loffset` argument of resample() was deprecated in pandas 1.1 and
# removed in 2.0; the supported replacement is to shift the resampled index.
monthly_mean = df_hurriyet_q1[["sentiment_score"]].resample('MS').mean()
monthly_mean.index = monthly_mean.index + pd.Timedelta(14, 'd')
monthly_mean.plot(figsize=(15,4))