In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class FocalLoss(nn.Module):
    """Focal loss computed from raw class logits.

    Per element the loss is ``-alpha_t * (1 - p_t) ** gamma * log(p_t)``,
    where ``p_t`` is the softmax probability of the target class.

    Args:
        gamma: Focusing exponent; ``0`` reduces the loss to (optionally
            alpha-weighted) cross entropy.
        alpha: Optional class weights. A float/int ``a`` becomes the
            two-class weights ``[a, 1 - a]``; a list becomes a weight
            tensor; any other value (e.g. a tensor or ``None``) is stored
            unchanged.
        size_average: If true, return the mean of the per-element losses,
            otherwise their sum.
    """

    def __init__(self, gamma=0, alpha=None, size_average=True):
        super().__init__()
        self.gamma = gamma
        self.alpha = alpha
        if isinstance(alpha, (float, int)):
            self.alpha = torch.tensor([alpha, 1 - alpha], dtype=torch.float)
        if isinstance(alpha, list):
            self.alpha = torch.tensor(alpha, dtype=torch.float)
        self.size_average = size_average

    def forward(self, input, target):
        """Return the reduced focal loss.

        Args:
            input: Logits with classes on dimension 1; inputs with more than
                two dimensions are flattened to ``(-1, C)``.
            target: Integer class indices, one per flattened input row.

        Returns:
            A scalar tensor (mean or sum, depending on ``size_average``).
        """
        if input.dim() > 2:
            input = input.view(input.size(0), input.size(1), -1)  # N,C,H,W => N,C,H*W
            input = input.transpose(1, 2)                         # N,C,H*W => N,H*W,C
            input = input.contiguous().view(-1, input.size(2))    # N,H*W,C => N*H*W,C
        target = target.view(-1, 1)

        # Explicit class dimension: the implicit-dim form is deprecated.
        logpt = F.log_softmax(input, dim=1)
        logpt = logpt.gather(1, target).view(-1)
        # The modulating factor is deliberately detached, so gradients only
        # flow through log(p_t) (same behaviour as the original code).
        pt = logpt.detach().exp()

        if self.alpha is not None:
            # Keep the class weights on the same device/dtype as the logits.
            if self.alpha.dtype != input.dtype or self.alpha.device != input.device:
                self.alpha = self.alpha.to(device=input.device, dtype=input.dtype)
            at = self.alpha.gather(0, target.view(-1))
            logpt = logpt * at

        loss = -1 * (1 - pt) ** self.gamma * logpt
        if self.size_average:
            return loss.mean()
        return loss.sum()

In [None]:
# pip install transformers
# pip  install openpyxl
import torch
from transformers import BertTokenizer
from IPython.display import clear_output
import os
import pandas as pd
import re
import random
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import BertForSequenceClassification
import gc
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from copy import deepcopy

# --- Tokenizer, raw data loading and train/validation split -----------------
PRETRAINED_MODEL_NAME = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
clear_output()
print("PyTorch 版本：", torch.__version__)

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

df_train = pd.read_csv('../input/8888888888888/train.csv')

# Keep only the text and its label, under the names the dataset class expects.
df_train = df_train.loc[:, ['text', 'target']]
df_train.columns = ['text_a', 'label']

# Shuffle whole rows with a single seeded permutation. Shuffling the result of
# `df_train.loc[:, col]` in place only mutates a temporary Series (a no-op
# under pandas copy-on-write) and handles text and label separately.
seed = 696969
random.seed(seed)
row_order = list(range(len(df_train)))
random.shuffle(row_order)
df_train = df_train.iloc[row_order].reset_index(drop=True)

# Label distribution of the full data set.
df_cat = df_train['label'].value_counts().rename_axis('label').reset_index(name='count')
print(df_cat)

# First 7000 shuffled rows are used for training.
df_train_train = df_train.loc[:, ['text_a', 'label']][:7000]
print("train樣本數：", len(df_train_train))
df_train_train.to_csv("train.tsv", sep="\t", index=False)

df_cat = df_train_train['label'].value_counts().rename_axis('label').reset_index(name='count')
print(df_cat)

# The remaining rows form the held-out set.
df_train_test = df_train.loc[:, ['text_a', 'label']][7000:]
print("test樣本數：", len(df_train_test))
df_train_test.to_csv("testt.tsv", sep="\t", index=False)

df_cat = df_train_test['label'].value_counts().rename_axis('label').reset_index(name='count')
print(df_cat)

class FakeNewsDataset(Dataset):
    """Dataset over ``<mode>.tsv`` that yields BERT-ready tensors.

    Args:
        mode: One of ``"train"``, ``"testt"`` or ``"submit"``; selects the
            tab-separated file ``<mode>.tsv`` in the working directory.
        tokenizer: Object providing ``tokenize`` and
            ``convert_tokens_to_ids``.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """

    def __init__(self, mode, tokenizer):
        # Explicit check instead of `assert`, which is stripped under -O.
        if mode not in ("train", "testt", "submit"):
            raise ValueError(f"unsupported mode: {mode!r}")
        self.mode = mode
        self.df = pd.read_csv(mode + ".tsv", sep="\t").fillna("")
        self.len = len(self.df)
        self.label_map = {0: 0, 1: 1}
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """Return ``(tokens_tensor, segments_tensor, label_tensor)`` for row ``idx``.

        ``label_tensor`` is ``None`` in ``"submit"`` mode.
        """
        row = self.df.iloc[idx]
        text_a = row["text_a"]
        if self.mode == "submit":
            label_tensor = None
        else:
            label_tensor = torch.tensor(self.label_map[row["label"]])

        # Single-sentence input: [CLS] tokens... [SEP]
        word_pieces = ["[CLS]"] + self.tokenizer.tokenize(text_a) + ["[SEP]"]
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)

        # Only one segment is used, so every position gets segment id 0.
        segments_tensor = torch.tensor([0] * len(word_pieces), dtype=torch.long)
        return (tokens_tensor, segments_tensor, label_tensor)

    def __len__(self):
        return self.len

def create_mini_batch(samples):
    """Collate ``(tokens, segments, label)`` samples into padded batch tensors.

    Returns ``(tokens_tensors, segments_tensors, masks_tensors, label_ids)``.
    Sequences are zero-padded to the longest one in the batch, the mask is 1
    wherever the token id is non-zero, and ``label_ids`` is ``None`` when the
    first sample carries no label.
    """
    def _padded(position):
        # Zero-pad the tensors found at `position` of every sample.
        return pad_sequence([sample[position] for sample in samples], batch_first=True)

    has_labels = samples[0][2] is not None
    label_ids = torch.stack([sample[2] for sample in samples]) if has_labels else None

    tokens_tensors = _padded(0)
    segments_tensors = _padded(1)

    # Attention mask: 1 on real tokens, 0 on padding (token id 0).
    masks_tensors = (tokens_tensors != 0).to(torch.long)

    return tokens_tensors, segments_tensors, masks_tensors, label_ids

def get_predictions(model, dataloader, compute_acc=False):
    """Run ``model`` over ``dataloader`` in eval mode and collect predictions.

    Args:
        model: Module called with ``input_ids``, ``token_type_ids`` and
            ``attention_mask``; element 0 of its output is used as logits.
        dataloader: Iterable of ``(tokens, segments, masks, labels)`` batches;
            ``labels`` may be ``None`` when ``compute_acc`` is false.
        compute_acc: When true, also score the predictions against the labels.

    Returns:
        If ``compute_acc`` is false, a 1-D tensor of predicted class indices
        (``None`` if the dataloader yields no batches). Otherwise a tuple
        ``(f1, accuracy)``.
    """
    model.eval()
    # Use the device the model actually lives on rather than assuming "cuda:0".
    device = next(model.parameters()).device
    batch_preds = []
    batch_labels = []
    correct = 0
    total = 0
    with torch.no_grad():
        for data in dataloader:
            data = [t.to(device) for t in data if t is not None]

            tokens_tensors, segments_tensors, masks_tensors = data[:3]
            outputs = model(input_ids=tokens_tensors,
                            token_type_ids=segments_tensors,
                            attention_mask=masks_tensors)

            logits = outputs[0]
            _, pred = torch.max(logits, 1)
            batch_preds.append(pred)

            if compute_acc:
                labels = data[3]
                total += labels.size(0)
                correct += (pred == labels).sum().item()
                batch_labels.append(labels)

    # Concatenate once at the end instead of re-allocating on every batch.
    predictions = torch.cat(batch_preds) if batch_preds else None

    if compute_acc:
        acc = correct / total
        y_true = torch.cat(batch_labels).cpu().numpy()
        y_pred = predictions.cpu().numpy()
        # f1_score expects (y_true, y_pred) in that order.
        return f1_score(y_true, y_pred), acc

    return predictions

# --- Fine-tuning, best-epoch selection and submission file ------------------
NUM_LABELS = 2

model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)
# NOTE: model2 is only a second name for the same object, not an independent copy.
model2 = model
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
model2 = model2.to(device)

trainset = FakeNewsDataset("train", tokenizer=tokenizer)
trainloader = DataLoader(trainset, batch_size=32, collate_fn=create_mini_batch, shuffle=True)

testset = FakeNewsDataset("testt", tokenizer=tokenizer)
testloader = DataLoader(testset, batch_size=256, collate_fn=create_mini_batch)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-6)

EPOCHS = 13
record_max = 0            # best held-out accuracy seen so far
record_max_f1_score = 0   # best held-out F1 seen so far
e = 0                     # epoch index that produced the best held-out F1

for epoch in range(EPOCHS):

    running_loss = 0.0

    # get_predictions() leaves the model in eval mode, so switch back to
    # training mode at the start of every epoch.
    model.train()
    for data in trainloader:
        tokens_tensors, segments_tensors, masks_tensors, labels = [t.to(device) for t in data]

        optimizer.zero_grad()

        # Passing labels makes the model return its own loss as outputs[0].
        outputs = model(input_ids=tokens_tensors, token_type_ids=segments_tensors,
                        attention_mask=masks_tensors, labels=labels)
        loss = outputs[0]

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    f1_train, acc = get_predictions(model, trainloader, compute_acc=True)
    f1_test, tacc = get_predictions(model, testloader, compute_acc=True)

    # One checkpoint per epoch; state_dict() is serialised directly, an extra
    # deep copy would only duplicate the weights in memory.
    torch.save(model.state_dict(), str(epoch) + ".pt")

    if record_max_f1_score < f1_test:
        e = epoch
        record_max_f1_score = f1_test
    if record_max < tacc:
        record_max = tacc

    print("epoch:", epoch, "train_loss:", running_loss, "train_f1:", f1_train, "train_acc:", acc)
    print("epoch:",epoch,"f1_score:",f1_test,"f1_score_max:",record_max_f1_score,"\n")

# Restore the checkpoint of the best epoch (by held-out F1) and re-evaluate it.
model.load_state_dict(torch.load(str(e) + ".pt", map_location=device))
model.eval()
pt, tacc = get_predictions(model, testloader, compute_acc=True)  # pt holds the F1 score
print(tacc,e)

df_submit = pd.read_csv("../input/8888888888888/test.csv")
df_submit = df_submit.loc[:, ['text', 'id']]
df_submit.columns = ["text_a", "id"]

df_submit.to_csv("submit.tsv", sep="\t", index=False)
print(df_submit)

submitset = FakeNewsDataset("submit", tokenizer=tokenizer)
submitloader = DataLoader(submitset, batch_size=1, collate_fn=create_mini_batch)

predictions = get_predictions(model, submitloader)

# Map predicted class indices back to the original label values.
index_map = {v: k for k, v in testset.label_map.items()}

print(e)

df = pd.DataFrame({"target": predictions.tolist()})
df['target'] = df.target.apply(lambda x: index_map[x])
df_pred = pd.concat([submitset.df.loc[:, ["id"]], df.loc[:, 'target']], axis=1)
df_pred.to_csv('kkkkkk' + str(e) + '.csv', index=False)


In [None]:
# pip install transformers
# pip  install openpyxl
import torch
from transformers import BertTokenizer
from IPython.display import clear_output
import os
import pandas as pd
import re
import random
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import BertForSequenceClassification
import gc
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from copy import deepcopy

# --- Tokenizer, raw data loading and train/validation split -----------------
PRETRAINED_MODEL_NAME = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
clear_output()
print("PyTorch 版本：", torch.__version__)

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

df_train = pd.read_csv('../input/8888888888888/train.csv')

# Keep only the text and its label, under the names the dataset class expects.
df_train = df_train.loc[:, ['text', 'target']]
df_train.columns = ['text_a', 'label']

# Shuffle whole rows with a single seeded permutation. Shuffling the result of
# `df_train.loc[:, col]` in place only mutates a temporary Series (a no-op
# under pandas copy-on-write) and handles text and label separately.
seed = 69696915516
random.seed(seed)
row_order = list(range(len(df_train)))
random.shuffle(row_order)
df_train = df_train.iloc[row_order].reset_index(drop=True)

# Label distribution of the full data set.
df_cat = df_train['label'].value_counts().rename_axis('label').reset_index(name='count')
print(df_cat)

# First 7000 shuffled rows are used for training.
df_train_train = df_train.loc[:, ['text_a', 'label']][:7000]
print("train樣本數：", len(df_train_train))
df_train_train.to_csv("train.tsv", sep="\t", index=False)

df_cat = df_train_train['label'].value_counts().rename_axis('label').reset_index(name='count')
print(df_cat)

# The remaining rows form the held-out set.
df_train_test = df_train.loc[:, ['text_a', 'label']][7000:]
print("test樣本數：", len(df_train_test))
df_train_test.to_csv("testt.tsv", sep="\t", index=False)

df_cat = df_train_test['label'].value_counts().rename_axis('label').reset_index(name='count')
print(df_cat)

class FakeNewsDataset(Dataset):
    """Dataset over ``<mode>.tsv`` that yields BERT-ready tensors.

    Args:
        mode: One of ``"train"``, ``"testt"`` or ``"submit"``; selects the
            tab-separated file ``<mode>.tsv`` in the working directory.
        tokenizer: Object providing ``tokenize`` and
            ``convert_tokens_to_ids``.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """

    def __init__(self, mode, tokenizer):
        # Explicit check instead of `assert`, which is stripped under -O.
        if mode not in ("train", "testt", "submit"):
            raise ValueError(f"unsupported mode: {mode!r}")
        self.mode = mode
        self.df = pd.read_csv(mode + ".tsv", sep="\t").fillna("")
        self.len = len(self.df)
        self.label_map = {0: 0, 1: 1}
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """Return ``(tokens_tensor, segments_tensor, label_tensor)`` for row ``idx``.

        ``label_tensor`` is ``None`` in ``"submit"`` mode.
        """
        row = self.df.iloc[idx]
        text_a = row["text_a"]
        if self.mode == "submit":
            label_tensor = None
        else:
            label_tensor = torch.tensor(self.label_map[row["label"]])

        # Single-sentence input: [CLS] tokens... [SEP]
        word_pieces = ["[CLS]"] + self.tokenizer.tokenize(text_a) + ["[SEP]"]
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)

        # Only one segment is used, so every position gets segment id 0.
        segments_tensor = torch.tensor([0] * len(word_pieces), dtype=torch.long)
        return (tokens_tensor, segments_tensor, label_tensor)

    def __len__(self):
        return self.len

def create_mini_batch(samples):
    """Collate ``(tokens, segments, label)`` samples into padded batch tensors.

    Returns ``(tokens_tensors, segments_tensors, masks_tensors, label_ids)``.
    Sequences are zero-padded to the longest one in the batch, the mask is 1
    wherever the token id is non-zero, and ``label_ids`` is ``None`` when the
    first sample carries no label.
    """
    def _padded(position):
        # Zero-pad the tensors found at `position` of every sample.
        return pad_sequence([sample[position] for sample in samples], batch_first=True)

    has_labels = samples[0][2] is not None
    label_ids = torch.stack([sample[2] for sample in samples]) if has_labels else None

    tokens_tensors = _padded(0)
    segments_tensors = _padded(1)

    # Attention mask: 1 on real tokens, 0 on padding (token id 0).
    masks_tensors = (tokens_tensors != 0).to(torch.long)

    return tokens_tensors, segments_tensors, masks_tensors, label_ids

def get_predictions(model, dataloader, compute_acc=False):
    """Run ``model`` over ``dataloader`` in eval mode and collect predictions.

    Args:
        model: Module called with ``input_ids``, ``token_type_ids`` and
            ``attention_mask``; element 0 of its output is used as logits.
        dataloader: Iterable of ``(tokens, segments, masks, labels)`` batches;
            ``labels`` may be ``None`` when ``compute_acc`` is false.
        compute_acc: When true, also score the predictions against the labels.

    Returns:
        If ``compute_acc`` is false, a 1-D tensor of predicted class indices
        (``None`` if the dataloader yields no batches). Otherwise a tuple
        ``(f1, accuracy)``.
    """
    model.eval()
    # Use the device the model actually lives on rather than assuming "cuda:0".
    device = next(model.parameters()).device
    batch_preds = []
    batch_labels = []
    correct = 0
    total = 0
    with torch.no_grad():
        for data in dataloader:
            data = [t.to(device) for t in data if t is not None]

            tokens_tensors, segments_tensors, masks_tensors = data[:3]
            outputs = model(input_ids=tokens_tensors,
                            token_type_ids=segments_tensors,
                            attention_mask=masks_tensors)

            logits = outputs[0]
            _, pred = torch.max(logits, 1)
            batch_preds.append(pred)

            if compute_acc:
                labels = data[3]
                total += labels.size(0)
                correct += (pred == labels).sum().item()
                batch_labels.append(labels)

    # Concatenate once at the end instead of re-allocating on every batch.
    predictions = torch.cat(batch_preds) if batch_preds else None

    if compute_acc:
        acc = correct / total
        y_true = torch.cat(batch_labels).cpu().numpy()
        y_pred = predictions.cpu().numpy()
        # f1_score expects (y_true, y_pred) in that order.
        return f1_score(y_true, y_pred), acc

    return predictions

# --- Fine-tuning, best-epoch selection and submission file ------------------
NUM_LABELS = 2

model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)
# NOTE: model2 is only a second name for the same object, not an independent copy.
model2 = model
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
model2 = model2.to(device)

trainset = FakeNewsDataset("train", tokenizer=tokenizer)
trainloader = DataLoader(trainset, batch_size=32, collate_fn=create_mini_batch, shuffle=True)

testset = FakeNewsDataset("testt", tokenizer=tokenizer)
testloader = DataLoader(testset, batch_size=256, collate_fn=create_mini_batch)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-6)

EPOCHS = 13
record_max = 0            # best held-out accuracy seen so far
record_max_f1_score = 0   # best held-out F1 seen so far
e = 0                     # epoch index that produced the best held-out F1

for epoch in range(EPOCHS):

    running_loss = 0.0

    # get_predictions() leaves the model in eval mode, so switch back to
    # training mode at the start of every epoch.
    model.train()
    for data in trainloader:
        tokens_tensors, segments_tensors, masks_tensors, labels = [t.to(device) for t in data]

        optimizer.zero_grad()

        # Passing labels makes the model return its own loss as outputs[0].
        outputs = model(input_ids=tokens_tensors, token_type_ids=segments_tensors,
                        attention_mask=masks_tensors, labels=labels)
        loss = outputs[0]

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    f1_train, acc = get_predictions(model, trainloader, compute_acc=True)
    f1_test, tacc = get_predictions(model, testloader, compute_acc=True)

    # One checkpoint per epoch; state_dict() is serialised directly, an extra
    # deep copy would only duplicate the weights in memory.
    torch.save(model.state_dict(), str(epoch) + ".pt")

    if record_max_f1_score < f1_test:
        e = epoch
        record_max_f1_score = f1_test
    if record_max < tacc:
        record_max = tacc

    print("epoch:", epoch, "train_loss:", running_loss, "train_f1:", f1_train, "train_acc:", acc)
    print("epoch:",epoch,"f1_score:",f1_test,"f1_score_max:",record_max_f1_score,"\n")

# Restore the checkpoint of the best epoch (by held-out F1) and re-evaluate it.
model.load_state_dict(torch.load(str(e) + ".pt", map_location=device))
model.eval()
pt, tacc = get_predictions(model, testloader, compute_acc=True)  # pt holds the F1 score
print(tacc,e)

df_submit = pd.read_csv("../input/8888888888888/test.csv")
df_submit = df_submit.loc[:, ['text', 'id']]
df_submit.columns = ["text_a", "id"]

df_submit.to_csv("submit.tsv", sep="\t", index=False)
print(df_submit)

submitset = FakeNewsDataset("submit", tokenizer=tokenizer)
submitloader = DataLoader(submitset, batch_size=1, collate_fn=create_mini_batch)

predictions = get_predictions(model, submitloader)

# Map predicted class indices back to the original label values.
index_map = {v: k for k, v in testset.label_map.items()}

print(e)

df = pd.DataFrame({"target": predictions.tolist()})
df['target'] = df.target.apply(lambda x: index_map[x])
df_pred = pd.concat([submitset.df.loc[:, ["id"]], df.loc[:, 'target']], axis=1)
df_pred.to_csv('kkkkkk' + str(e) + '.csv', index=False)


In [None]:
# pip install transformers
# pip  install openpyxl
import torch
from transformers import BertTokenizer
from IPython.display import clear_output
import os
import pandas as pd
import re
import random
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import BertForSequenceClassification
import gc
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from copy import deepcopy

# --- Tokenizer, raw data loading and train/validation split -----------------
PRETRAINED_MODEL_NAME = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
clear_output()
print("PyTorch 版本：", torch.__version__)

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

df_train = pd.read_csv('../input/8888888888888/train.csv')

# Keep only the text and its label, under the names the dataset class expects.
df_train = df_train.loc[:, ['text', 'target']]
df_train.columns = ['text_a', 'label']

# Shuffle whole rows with a single seeded permutation. Shuffling the result of
# `df_train.loc[:, col]` in place only mutates a temporary Series (a no-op
# under pandas copy-on-write) and handles text and label separately.
seed = 66669999
random.seed(seed)
row_order = list(range(len(df_train)))
random.shuffle(row_order)
df_train = df_train.iloc[row_order].reset_index(drop=True)

# Label distribution of the full data set.
df_cat = df_train['label'].value_counts().rename_axis('label').reset_index(name='count')
print(df_cat)

# First 7000 shuffled rows are used for training.
df_train_train = df_train.loc[:, ['text_a', 'label']][:7000]
print("train樣本數：", len(df_train_train))
df_train_train.to_csv("train.tsv", sep="\t", index=False)

df_cat = df_train_train['label'].value_counts().rename_axis('label').reset_index(name='count')
print(df_cat)

# The remaining rows form the held-out set.
df_train_test = df_train.loc[:, ['text_a', 'label']][7000:]
print("test樣本數：", len(df_train_test))
df_train_test.to_csv("testt.tsv", sep="\t", index=False)

df_cat = df_train_test['label'].value_counts().rename_axis('label').reset_index(name='count')
print(df_cat)

class FakeNewsDataset(Dataset):
    """Dataset over ``<mode>.tsv`` that yields BERT-ready tensors.

    Args:
        mode: One of ``"train"``, ``"testt"`` or ``"submit"``; selects the
            tab-separated file ``<mode>.tsv`` in the working directory.
        tokenizer: Object providing ``tokenize`` and
            ``convert_tokens_to_ids``.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """

    def __init__(self, mode, tokenizer):
        # Explicit check instead of `assert`, which is stripped under -O.
        if mode not in ("train", "testt", "submit"):
            raise ValueError(f"unsupported mode: {mode!r}")
        self.mode = mode
        self.df = pd.read_csv(mode + ".tsv", sep="\t").fillna("")
        self.len = len(self.df)
        self.label_map = {0: 0, 1: 1}
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """Return ``(tokens_tensor, segments_tensor, label_tensor)`` for row ``idx``.

        ``label_tensor`` is ``None`` in ``"submit"`` mode.
        """
        row = self.df.iloc[idx]
        text_a = row["text_a"]
        if self.mode == "submit":
            label_tensor = None
        else:
            label_tensor = torch.tensor(self.label_map[row["label"]])

        # Single-sentence input: [CLS] tokens... [SEP]
        word_pieces = ["[CLS]"] + self.tokenizer.tokenize(text_a) + ["[SEP]"]
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)

        # Only one segment is used, so every position gets segment id 0.
        segments_tensor = torch.tensor([0] * len(word_pieces), dtype=torch.long)
        return (tokens_tensor, segments_tensor, label_tensor)

    def __len__(self):
        return self.len

def create_mini_batch(samples):
    """Collate ``(tokens, segments, label)`` samples into padded batch tensors.

    Returns ``(tokens_tensors, segments_tensors, masks_tensors, label_ids)``.
    Sequences are zero-padded to the longest one in the batch, the mask is 1
    wherever the token id is non-zero, and ``label_ids`` is ``None`` when the
    first sample carries no label.
    """
    def _padded(position):
        # Zero-pad the tensors found at `position` of every sample.
        return pad_sequence([sample[position] for sample in samples], batch_first=True)

    has_labels = samples[0][2] is not None
    label_ids = torch.stack([sample[2] for sample in samples]) if has_labels else None

    tokens_tensors = _padded(0)
    segments_tensors = _padded(1)

    # Attention mask: 1 on real tokens, 0 on padding (token id 0).
    masks_tensors = (tokens_tensors != 0).to(torch.long)

    return tokens_tensors, segments_tensors, masks_tensors, label_ids

def get_predictions(model, dataloader, compute_acc=False):
    """Run ``model`` over ``dataloader`` in eval mode and collect predictions.

    Args:
        model: Module called with ``input_ids``, ``token_type_ids`` and
            ``attention_mask``; element 0 of its output is used as logits.
        dataloader: Iterable of ``(tokens, segments, masks, labels)`` batches;
            ``labels`` may be ``None`` when ``compute_acc`` is false.
        compute_acc: When true, also score the predictions against the labels.

    Returns:
        If ``compute_acc`` is false, a 1-D tensor of predicted class indices
        (``None`` if the dataloader yields no batches). Otherwise a tuple
        ``(f1, accuracy)``.
    """
    model.eval()
    # Use the device the model actually lives on rather than assuming "cuda:0".
    device = next(model.parameters()).device
    batch_preds = []
    batch_labels = []
    correct = 0
    total = 0
    with torch.no_grad():
        for data in dataloader:
            data = [t.to(device) for t in data if t is not None]

            tokens_tensors, segments_tensors, masks_tensors = data[:3]
            outputs = model(input_ids=tokens_tensors,
                            token_type_ids=segments_tensors,
                            attention_mask=masks_tensors)

            logits = outputs[0]
            _, pred = torch.max(logits, 1)
            batch_preds.append(pred)

            if compute_acc:
                labels = data[3]
                total += labels.size(0)
                correct += (pred == labels).sum().item()
                batch_labels.append(labels)

    # Concatenate once at the end instead of re-allocating on every batch.
    predictions = torch.cat(batch_preds) if batch_preds else None

    if compute_acc:
        acc = correct / total
        y_true = torch.cat(batch_labels).cpu().numpy()
        y_pred = predictions.cpu().numpy()
        # f1_score expects (y_true, y_pred) in that order.
        return f1_score(y_true, y_pred), acc

    return predictions

# --- Fine-tuning, best-epoch selection and submission file ------------------
NUM_LABELS = 2

model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)
# NOTE: model2 is only a second name for the same object, not an independent copy.
model2 = model
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
model2 = model2.to(device)

trainset = FakeNewsDataset("train", tokenizer=tokenizer)
trainloader = DataLoader(trainset, batch_size=32, collate_fn=create_mini_batch, shuffle=True)

testset = FakeNewsDataset("testt", tokenizer=tokenizer)
testloader = DataLoader(testset, batch_size=256, collate_fn=create_mini_batch)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-6)

EPOCHS = 13
record_max = 0            # best held-out accuracy seen so far
record_max_f1_score = 0   # best held-out F1 seen so far
e = 0                     # epoch index that produced the best held-out F1

for epoch in range(EPOCHS):

    running_loss = 0.0

    # get_predictions() leaves the model in eval mode, so switch back to
    # training mode at the start of every epoch.
    model.train()
    for data in trainloader:
        tokens_tensors, segments_tensors, masks_tensors, labels = [t.to(device) for t in data]

        optimizer.zero_grad()

        # Passing labels makes the model return its own loss as outputs[0].
        outputs = model(input_ids=tokens_tensors, token_type_ids=segments_tensors,
                        attention_mask=masks_tensors, labels=labels)
        loss = outputs[0]

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    f1_train, acc = get_predictions(model, trainloader, compute_acc=True)
    f1_test, tacc = get_predictions(model, testloader, compute_acc=True)

    # One checkpoint per epoch; state_dict() is serialised directly, an extra
    # deep copy would only duplicate the weights in memory.
    torch.save(model.state_dict(), str(epoch) + ".pt")

    if record_max_f1_score < f1_test:
        e = epoch
        record_max_f1_score = f1_test
    if record_max < tacc:
        record_max = tacc

    print("epoch:", epoch, "train_loss:", running_loss, "train_f1:", f1_train, "train_acc:", acc)
    print("epoch:",epoch,"f1_score:",f1_test,"f1_score_max:",record_max_f1_score,"\n")

# Restore the checkpoint of the best epoch (by held-out F1) and re-evaluate it.
model.load_state_dict(torch.load(str(e) + ".pt", map_location=device))
model.eval()
pt, tacc = get_predictions(model, testloader, compute_acc=True)  # pt holds the F1 score
print(tacc,e)

df_submit = pd.read_csv("../input/8888888888888/test.csv")
df_submit = df_submit.loc[:, ['text', 'id']]
df_submit.columns = ["text_a", "id"]

df_submit.to_csv("submit.tsv", sep="\t", index=False)
print(df_submit)

submitset = FakeNewsDataset("submit", tokenizer=tokenizer)
submitloader = DataLoader(submitset, batch_size=1, collate_fn=create_mini_batch)

predictions = get_predictions(model, submitloader)

# Map predicted class indices back to the original label values.
index_map = {v: k for k, v in testset.label_map.items()}

print(e)

df = pd.DataFrame({"target": predictions.tolist()})
df['target'] = df.target.apply(lambda x: index_map[x])
df_pred = pd.concat([submitset.df.loc[:, ["id"]], df.loc[:, 'target']], axis=1)
df_pred.to_csv('kkkkkk' + str(e) + '.csv', index=False)


In [None]:
# pip install transformers
# pip  install openpyxl
import torch
from transformers import BertTokenizer
from IPython.display import clear_output
import os
import pandas as pd
import re
import random
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import BertForSequenceClassification
import gc
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from copy import deepcopy

# --- Tokenizer, raw data loading and train/validation split -----------------
PRETRAINED_MODEL_NAME = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
clear_output()
print("PyTorch 版本：", torch.__version__)

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

df_train = pd.read_csv('../input/8888888888888/train.csv')

# Keep only the text and its label, under the names the dataset class expects.
df_train = df_train.loc[:, ['text', 'target']]
df_train.columns = ['text_a', 'label']

# Shuffle whole rows with a single seeded permutation. Shuffling the result of
# `df_train.loc[:, col]` in place only mutates a temporary Series (a no-op
# under pandas copy-on-write) and handles text and label separately.
seed = 111111
random.seed(seed)
row_order = list(range(len(df_train)))
random.shuffle(row_order)
df_train = df_train.iloc[row_order].reset_index(drop=True)

# Label distribution of the full data set.
df_cat = df_train['label'].value_counts().rename_axis('label').reset_index(name='count')
print(df_cat)

# First 7000 shuffled rows are used for training.
df_train_train = df_train.loc[:, ['text_a', 'label']][:7000]
print("train樣本數：", len(df_train_train))
df_train_train.to_csv("train.tsv", sep="\t", index=False)

df_cat = df_train_train['label'].value_counts().rename_axis('label').reset_index(name='count')
print(df_cat)

# The remaining rows form the held-out set.
df_train_test = df_train.loc[:, ['text_a', 'label']][7000:]
print("test樣本數：", len(df_train_test))
df_train_test.to_csv("testt.tsv", sep="\t", index=False)

df_cat = df_train_test['label'].value_counts().rename_axis('label').reset_index(name='count')
print(df_cat)

class FakeNewsDataset(Dataset):
    """Dataset over ``<mode>.tsv`` that yields BERT-ready tensors.

    Args:
        mode: One of ``"train"``, ``"testt"`` or ``"submit"``; selects the
            tab-separated file ``<mode>.tsv`` in the working directory.
        tokenizer: Object providing ``tokenize`` and
            ``convert_tokens_to_ids``.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """

    def __init__(self, mode, tokenizer):
        # Explicit check instead of `assert`, which is stripped under -O.
        if mode not in ("train", "testt", "submit"):
            raise ValueError(f"unsupported mode: {mode!r}")
        self.mode = mode
        self.df = pd.read_csv(mode + ".tsv", sep="\t").fillna("")
        self.len = len(self.df)
        self.label_map = {0: 0, 1: 1}
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """Return ``(tokens_tensor, segments_tensor, label_tensor)`` for row ``idx``.

        ``label_tensor`` is ``None`` in ``"submit"`` mode.
        """
        row = self.df.iloc[idx]
        text_a = row["text_a"]
        if self.mode == "submit":
            label_tensor = None
        else:
            label_tensor = torch.tensor(self.label_map[row["label"]])

        # Single-sentence input: [CLS] tokens... [SEP]
        word_pieces = ["[CLS]"] + self.tokenizer.tokenize(text_a) + ["[SEP]"]
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)

        # Only one segment is used, so every position gets segment id 0.
        segments_tensor = torch.tensor([0] * len(word_pieces), dtype=torch.long)
        return (tokens_tensor, segments_tensor, label_tensor)

    def __len__(self):
        return self.len

def create_mini_batch(samples):
    """Collate ``(tokens, segments, label)`` samples into padded batch tensors.

    Returns ``(tokens_tensors, segments_tensors, masks_tensors, label_ids)``.
    Sequences are zero-padded to the longest one in the batch, the mask is 1
    wherever the token id is non-zero, and ``label_ids`` is ``None`` when the
    first sample carries no label.
    """
    def _padded(position):
        # Zero-pad the tensors found at `position` of every sample.
        return pad_sequence([sample[position] for sample in samples], batch_first=True)

    has_labels = samples[0][2] is not None
    label_ids = torch.stack([sample[2] for sample in samples]) if has_labels else None

    tokens_tensors = _padded(0)
    segments_tensors = _padded(1)

    # Attention mask: 1 on real tokens, 0 on padding (token id 0).
    masks_tensors = (tokens_tensors != 0).to(torch.long)

    return tokens_tensors, segments_tensors, masks_tensors, label_ids

def get_predictions(model, dataloader, compute_acc=False):
    """Run ``model`` over ``dataloader`` in eval mode and collect predictions.

    Args:
        model: Module called with ``input_ids``, ``token_type_ids`` and
            ``attention_mask``; element 0 of its output is used as logits.
        dataloader: Iterable of ``(tokens, segments, masks, labels)`` batches;
            ``labels`` may be ``None`` when ``compute_acc`` is false.
        compute_acc: When true, also score the predictions against the labels.

    Returns:
        If ``compute_acc`` is false, a 1-D tensor of predicted class indices
        (``None`` if the dataloader yields no batches). Otherwise a tuple
        ``(f1, accuracy)``.
    """
    model.eval()
    # Use the device the model actually lives on rather than assuming "cuda:0".
    device = next(model.parameters()).device
    batch_preds = []
    batch_labels = []
    correct = 0
    total = 0
    with torch.no_grad():
        for data in dataloader:
            data = [t.to(device) for t in data if t is not None]

            tokens_tensors, segments_tensors, masks_tensors = data[:3]
            outputs = model(input_ids=tokens_tensors,
                            token_type_ids=segments_tensors,
                            attention_mask=masks_tensors)

            logits = outputs[0]
            _, pred = torch.max(logits, 1)
            batch_preds.append(pred)

            if compute_acc:
                labels = data[3]
                total += labels.size(0)
                correct += (pred == labels).sum().item()
                batch_labels.append(labels)

    # Concatenate once at the end instead of re-allocating on every batch.
    predictions = torch.cat(batch_preds) if batch_preds else None

    if compute_acc:
        acc = correct / total
        y_true = torch.cat(batch_labels).cpu().numpy()
        y_pred = predictions.cpu().numpy()
        # f1_score expects (y_true, y_pred) in that order.
        return f1_score(y_true, y_pred), acc

    return predictions

# --- Fine-tuning, best-epoch selection and submission file ------------------
NUM_LABELS = 2

model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)
# NOTE: model2 is only a second name for the same object, not an independent copy.
model2 = model
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
model2 = model2.to(device)

trainset = FakeNewsDataset("train", tokenizer=tokenizer)
trainloader = DataLoader(trainset, batch_size=32, collate_fn=create_mini_batch, shuffle=True)

testset = FakeNewsDataset("testt", tokenizer=tokenizer)
testloader = DataLoader(testset, batch_size=256, collate_fn=create_mini_batch)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-6)

EPOCHS = 20
record_max = 0            # best held-out accuracy seen so far
record_max_f1_score = 0   # best held-out F1 seen so far
e = 0                     # epoch index that produced the best held-out F1

for epoch in range(EPOCHS):

    running_loss = 0.0

    # get_predictions() leaves the model in eval mode, so switch back to
    # training mode at the start of every epoch.
    model.train()
    for data in trainloader:
        tokens_tensors, segments_tensors, masks_tensors, labels = [t.to(device) for t in data]

        optimizer.zero_grad()

        # Passing labels makes the model return its own loss as outputs[0].
        outputs = model(input_ids=tokens_tensors, token_type_ids=segments_tensors,
                        attention_mask=masks_tensors, labels=labels)
        loss = outputs[0]

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    f1_train, acc = get_predictions(model, trainloader, compute_acc=True)
    f1_test, tacc = get_predictions(model, testloader, compute_acc=True)

    # One checkpoint per epoch; state_dict() is serialised directly, an extra
    # deep copy would only duplicate the weights in memory.
    torch.save(model.state_dict(), str(epoch) + ".pt")

    if record_max_f1_score < f1_test:
        e = epoch
        record_max_f1_score = f1_test
    if record_max < tacc:
        record_max = tacc

    print("epoch:", epoch, "train_loss:", running_loss, "train_f1:", f1_train, "train_acc:", acc)
    print("epoch:",epoch,"f1_score:",f1_test,"f1_score_max:",record_max_f1_score,"\n")

# Restore the checkpoint of the best epoch (by held-out F1) and re-evaluate it.
model.load_state_dict(torch.load(str(e) + ".pt", map_location=device))
model.eval()
pt, tacc = get_predictions(model, testloader, compute_acc=True)  # pt holds the F1 score
print(pt,e)

df_submit = pd.read_csv("../input/8888888888888/test.csv")
df_submit = df_submit.loc[:, ['text', 'id']]
df_submit.columns = ["text_a", "id"]

df_submit.to_csv("submit.tsv", sep="\t", index=False)
print(df_submit)

submitset = FakeNewsDataset("submit", tokenizer=tokenizer)
submitloader = DataLoader(submitset, batch_size=1, collate_fn=create_mini_batch)

predictions = get_predictions(model, submitloader)

# Map predicted class indices back to the original label values.
index_map = {v: k for k, v in testset.label_map.items()}

print(e)

df = pd.DataFrame({"target": predictions.tolist()})
df['target'] = df.target.apply(lambda x: index_map[x])
df_pred = pd.concat([submitset.df.loc[:, ["id"]], df.loc[:, 'target']], axis=1)
df_pred.to_csv('kkkkkk' + str(e) + '.csv', index=False)


In [None]:
# pip install transformers
# pip  install openpyxl
import torch
from transformers import BertTokenizer
from IPython.display import clear_output
import os
import pandas as pd
import re
import random
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import BertForSequenceClassification
import gc
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from copy import deepcopy

PRETRAINED_MODEL_NAME = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
clear_output()
print("PyTorch 版本：", torch.__version__)

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

df_train = pd.read_csv('../input/8888888888888/train.csv')

# Keep only the text and its target, renamed to (text_a, label).
df_train = df_train.loc[:, ['text', 'target']]
df_train.columns = ['text_a', 'label']

# Shuffle whole rows in a single reproducible step.  The previous approach
# (random.shuffle on each `.loc` column selection with the same seed) only
# affected df_train when pandas happened to hand back views; under
# Copy-on-Write it silently leaves the frame unshuffled.
seed = 696969
random.seed(seed)
df_train = df_train.sample(frac=1, random_state=seed).reset_index(drop=True)


def _label_counts(frame):
    """Return a (label, count) frame describing the class balance of `frame`."""
    counts = frame['label'].value_counts()
    return pd.DataFrame({'label': counts.index, 'count': counts.values})


df_cat = _label_counts(df_train)
print(df_cat)

# Rows before TRAIN_SIZE are used for training, the remainder for validation.
TRAIN_SIZE = 5000

df_train_train = df_train.loc[:, ['text_a', 'label']][:TRAIN_SIZE]
print("train樣本數：", len(df_train_train))
df_train_train.to_csv("train.tsv", sep="\t", index=False)

df_cat = _label_counts(df_train_train)
print(df_cat)

df_train_test = df_train.loc[:, ['text_a', 'label']][TRAIN_SIZE:]
print("test樣本數：", len(df_train_test))
df_train_test.to_csv("testt.tsv", sep="\t", index=False)

df_cat = _label_counts(df_train_test)
print(df_cat)

class FakeNewsDataset(Dataset):
    """Dataset backed by "<mode>.tsv" that yields BERT input tensors per row.

    Each item is a tuple (token ids, segment ids, label).  The label is a
    0-dim tensor for the "train" / "testt" modes and None for "submit".
    """

    def __init__(self, mode, tokenizer):
        assert mode in ["train", "testt", "submit"]
        self.mode = mode
        # Empty cells are read as NaN; replace them with empty strings.
        frame = pd.read_csv(mode + ".tsv", sep="\t")
        self.df = frame.fillna("")
        self.len = len(self.df)
        self.label_map = {0:0,1:1}
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """Return the tensors for row `idx` of the backing frame."""
        # Rows are unpacked positionally: (text, id) in submit mode,
        # (text, label) otherwise.
        row = self.df.iloc[idx, :].values
        if self.mode == "submit":
            text_a, _ = row
            label_tensor = None
        else:
            text_a, label = row
            label_tensor = torch.tensor(self.label_map[label])

        # Single-sentence input: [CLS] tokens... [SEP]
        word_pieces = ["[CLS]"] + self.tokenizer.tokenize(text_a) + ["[SEP]"]
        tokens_tensor = torch.tensor(self.tokenizer.convert_tokens_to_ids(word_pieces))

        # Only one sentence is present, so every position gets segment id 0.
        segments_tensor = torch.zeros(len(word_pieces), dtype=torch.long)
        return (tokens_tensor, segments_tensor, label_tensor)

    def __len__(self):
        return self.len

def create_mini_batch(samples):
    """Collate dataset items into zero-padded batch tensors.

    Args:
        samples: list of (tokens_tensor, segments_tensor, label_tensor)
            tuples; label_tensor is None for unlabeled data.

    Returns:
        (tokens_tensors, segments_tensors, masks_tensors, label_ids), where
        the first three are padded to the longest sequence in the batch and
        label_ids is None when the first sample carries no label.
    """
    token_seqs = [item[0] for item in samples]
    segment_seqs = [item[1] for item in samples]

    # The first sample decides whether this batch is labeled.
    first_label = samples[0][2]
    label_ids = None if first_label is None else torch.stack([item[2] for item in samples])

    # Pad with zeros up to the longest sequence in the batch.
    tokens_tensors = pad_sequence(token_seqs, batch_first=True)
    segments_tensors = pad_sequence(segment_seqs, batch_first=True)

    # Attention mask: 1 where the token id is non-zero, 0 on padding.
    masks_tensors = (tokens_tensors != 0).long()

    return tokens_tensors, segments_tensors, masks_tensors, label_ids

def get_predictions(model, dataloader, compute_acc=False):
    """Run `model` over `dataloader` in eval mode and collect its predictions.

    Args:
        model: module called with input_ids / token_type_ids / attention_mask
            keyword arguments; its first output is treated as the logits.
        dataloader: iterable of (tokens, segments, masks, labels) batches.
            The labels entry may be None when compute_acc is False.
        compute_acc: when True, score the predictions against the labels.

    Returns:
        compute_acc False: 1-D tensor of predicted class indices, or None if
        the dataloader produced no batches.
        compute_acc True: tuple (f1, accuracy).

    Raises:
        ValueError: compute_acc is True but the dataloader produced no batches.
    """
    model.eval()
    # Follow the model to whichever device it lives on instead of assuming
    # "cuda:0".
    device = next(model.parameters()).device

    # Collect per-batch chunks and concatenate once at the end; concatenating
    # inside the loop re-copies everything gathered so far on every batch.
    pred_chunks = []
    label_chunks = []
    with torch.no_grad():
        for data in dataloader:
            # Drop the missing label entry (unlabeled data) while moving.
            data = [t.to(device) for t in data if t is not None]

            tokens_tensors, segments_tensors, masks_tensors = data[:3]
            outputs = model(input_ids=tokens_tensors,
                            token_type_ids=segments_tensors,
                            attention_mask=masks_tensors)

            logits = outputs[0]
            _, pred = torch.max(logits, 1)
            pred_chunks.append(pred)

            if compute_acc:
                label_chunks.append(data[3])

    if not pred_chunks:
        if compute_acc:
            raise ValueError("cannot compute metrics: dataloader produced no batches")
        return None

    predictions = torch.cat(pred_chunks)

    if compute_acc:
        labelss = torch.cat(label_chunks)
        acc = (predictions == labelss).sum().item() / labelss.size(0)
        # f1_score expects (y_true, y_pred); binary F1 is symmetric, so this
        # ordering yields the same number as before but matches the API.
        return f1_score(labelss.cpu().numpy(), predictions.cpu().numpy()), acc

    return predictions

NUM_LABELS = 2

model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
# NOTE: model2 is only a second name for the very same module, not an
# independent copy; training `model` changes what `model2` refers to as well.
model2 = model

trainset = FakeNewsDataset("train", tokenizer=tokenizer)
trainloader = DataLoader(trainset, batch_size=32, collate_fn=create_mini_batch,shuffle=True)

testset = FakeNewsDataset("testt", tokenizer=tokenizer)
testloader = DataLoader(testset, batch_size=256, collate_fn=create_mini_batch)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-6)

EPOCHS = 20
record_max = 0            # best validation accuracy seen so far
record_max_f1_score = 0   # best validation F1 seen so far
e = 0                     # epoch at which the best validation F1 occurred

# Fine-tune for EPOCHS epochs. After every epoch the model is scored on both
# loaders, its weights are written to "<epoch>.pt", and `e` is updated when
# the validation F1 improves.
for epoch in range(EPOCHS):

    # Summed batch losses of this epoch (kept for inspection; never printed).
    running_loss = 0.0

    # get_predictions() puts the model in eval mode and nothing inside the
    # batch loop changes the mode, so restoring training mode once per epoch
    # is enough.
    model.train()

    for data in trainloader:
        tokens_tensors, segments_tensors, masks_tensors, labels = [t.to(device) for t in data]

        optimizer.zero_grad()

        # `labels` is passed in, and the first output is used as the loss.
        outputs = model(
            input_ids=tokens_tensors,
            token_type_ids=segments_tensors,
            attention_mask=masks_tensors,
            labels=labels,
        )
        loss = outputs[0]

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    f1_train, acc = get_predictions(model, trainloader, compute_acc=True)
    f1_test, tacc = get_predictions(model, testloader, compute_acc=True)

    # torch.save serialises the tensors itself; deep-copying the state dict
    # first only duplicated every weight in memory for no benefit.
    torch.save(model.state_dict(), str(epoch)+".pt")

    if record_max_f1_score < f1_test:
        e = epoch
        record_max_f1_score = f1_test
    if record_max < tacc:
        record_max = tacc

    print("epoch:",epoch,"f1_score:",f1_test,"f1_score_max:",record_max_f1_score,"\n")

# Reload the checkpoint written at epoch `e` (the epoch with the best
# validation F1 recorded by the training loop) and score it once more.
model.load_state_dict(torch.load(str(e)+".pt"))
model.eval()
pt, tacc = get_predictions(model, testloader, compute_acc=True)
print(tacc,e)

# Store the competition test set as submit.tsv with the columns
# (text_a, id), which is what FakeNewsDataset("submit") reads back.
df_submit = pd.read_csv("../input/8888888888888/test.csv")[['text', 'id']]
df_submit = df_submit.rename(columns={'text': "text_a"})

df_submit.to_csv("submit.tsv", sep="\t", index=False)
print(df_submit)

submitset = FakeNewsDataset("submit", tokenizer=tokenizer)
submitloader = DataLoader(submitset, collate_fn=create_mini_batch, batch_size=1)

predictions = get_predictions(model, submitloader)

# Invert label_map so predicted class indices map back to dataset labels.
index_map = dict((class_idx, label) for label, class_idx in testset.label_map.items())

print(e)

# One row per test example: its id next to the predicted label.
df = pd.DataFrame({"target": [index_map[p] for p in predictions.tolist()]})
df_pred = pd.concat([submitset.df[["id"]], df["target"]], axis=1)
df_pred.to_csv('kkkkkk'+str(e)+'.csv', index=False)


In [None]:
# pip install transformers
# pip  install openpyxl
import torch
from transformers import BertTokenizer
from IPython.display import clear_output
import os
import pandas as pd
import re
import random
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import BertForSequenceClassification
import gc
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from copy import deepcopy

PRETRAINED_MODEL_NAME = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
clear_output()
print("PyTorch 版本：", torch.__version__)

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

df_train = pd.read_csv('../input/8888888888888/train.csv')

# Keep only the text and its target, renamed to (text_a, label).
df_train = df_train.loc[:, ['text', 'target']]
df_train.columns = ['text_a', 'label']

# Shuffle whole rows in a single reproducible step.  The previous approach
# (random.shuffle on each `.loc` column selection with the same seed) only
# affected df_train when pandas happened to hand back views; under
# Copy-on-Write it silently leaves the frame unshuffled.
seed = 696969
random.seed(seed)
df_train = df_train.sample(frac=1, random_state=seed).reset_index(drop=True)


def _label_counts(frame):
    """Return a (label, count) frame describing the class balance of `frame`."""
    counts = frame['label'].value_counts()
    return pd.DataFrame({'label': counts.index, 'count': counts.values})


df_cat = _label_counts(df_train)
print(df_cat)

# Rows before TRAIN_SIZE are used for training, the remainder for validation.
TRAIN_SIZE = 7550

df_train_train = df_train.loc[:, ['text_a', 'label']][:TRAIN_SIZE]
print("train樣本數：", len(df_train_train))
df_train_train.to_csv("train.tsv", sep="\t", index=False)

df_cat = _label_counts(df_train_train)
print(df_cat)

df_train_test = df_train.loc[:, ['text_a', 'label']][TRAIN_SIZE:]
print("test樣本數：", len(df_train_test))
df_train_test.to_csv("testt.tsv", sep="\t", index=False)

df_cat = _label_counts(df_train_test)
print(df_cat)

class FakeNewsDataset(Dataset):
    """Dataset backed by "<mode>.tsv" that yields BERT input tensors per row.

    Each item is a tuple (token ids, segment ids, label).  The label is a
    0-dim tensor for the "train" / "testt" modes and None for "submit".
    """

    def __init__(self, mode, tokenizer):
        assert mode in ["train", "testt", "submit"]
        self.mode = mode
        # Empty cells are read as NaN; replace them with empty strings.
        frame = pd.read_csv(mode + ".tsv", sep="\t")
        self.df = frame.fillna("")
        self.len = len(self.df)
        self.label_map = {0:0,1:1}
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """Return the tensors for row `idx` of the backing frame."""
        # Rows are unpacked positionally: (text, id) in submit mode,
        # (text, label) otherwise.
        row = self.df.iloc[idx, :].values
        if self.mode == "submit":
            text_a, _ = row
            label_tensor = None
        else:
            text_a, label = row
            label_tensor = torch.tensor(self.label_map[label])

        # Single-sentence input: [CLS] tokens... [SEP]
        word_pieces = ["[CLS]"] + self.tokenizer.tokenize(text_a) + ["[SEP]"]
        tokens_tensor = torch.tensor(self.tokenizer.convert_tokens_to_ids(word_pieces))

        # Only one sentence is present, so every position gets segment id 0.
        segments_tensor = torch.zeros(len(word_pieces), dtype=torch.long)
        return (tokens_tensor, segments_tensor, label_tensor)

    def __len__(self):
        return self.len

def create_mini_batch(samples):
    """Collate dataset items into zero-padded batch tensors.

    Args:
        samples: list of (tokens_tensor, segments_tensor, label_tensor)
            tuples; label_tensor is None for unlabeled data.

    Returns:
        (tokens_tensors, segments_tensors, masks_tensors, label_ids), where
        the first three are padded to the longest sequence in the batch and
        label_ids is None when the first sample carries no label.
    """
    token_seqs = [item[0] for item in samples]
    segment_seqs = [item[1] for item in samples]

    # The first sample decides whether this batch is labeled.
    first_label = samples[0][2]
    label_ids = None if first_label is None else torch.stack([item[2] for item in samples])

    # Pad with zeros up to the longest sequence in the batch.
    tokens_tensors = pad_sequence(token_seqs, batch_first=True)
    segments_tensors = pad_sequence(segment_seqs, batch_first=True)

    # Attention mask: 1 where the token id is non-zero, 0 on padding.
    masks_tensors = (tokens_tensors != 0).long()

    return tokens_tensors, segments_tensors, masks_tensors, label_ids

def get_predictions(model, dataloader, compute_acc=False):
    """Run `model` over `dataloader` in eval mode and collect its predictions.

    Args:
        model: module called with input_ids / token_type_ids / attention_mask
            keyword arguments; its first output is treated as the logits.
        dataloader: iterable of (tokens, segments, masks, labels) batches.
            The labels entry may be None when compute_acc is False.
        compute_acc: when True, score the predictions against the labels.

    Returns:
        compute_acc False: 1-D tensor of predicted class indices, or None if
        the dataloader produced no batches.
        compute_acc True: tuple (f1, accuracy).

    Raises:
        ValueError: compute_acc is True but the dataloader produced no batches.
    """
    model.eval()
    # Follow the model to whichever device it lives on instead of assuming
    # "cuda:0".
    device = next(model.parameters()).device

    # Collect per-batch chunks and concatenate once at the end; concatenating
    # inside the loop re-copies everything gathered so far on every batch.
    pred_chunks = []
    label_chunks = []
    with torch.no_grad():
        for data in dataloader:
            # Drop the missing label entry (unlabeled data) while moving.
            data = [t.to(device) for t in data if t is not None]

            tokens_tensors, segments_tensors, masks_tensors = data[:3]
            outputs = model(input_ids=tokens_tensors,
                            token_type_ids=segments_tensors,
                            attention_mask=masks_tensors)

            logits = outputs[0]
            _, pred = torch.max(logits, 1)
            pred_chunks.append(pred)

            if compute_acc:
                label_chunks.append(data[3])

    if not pred_chunks:
        if compute_acc:
            raise ValueError("cannot compute metrics: dataloader produced no batches")
        return None

    predictions = torch.cat(pred_chunks)

    if compute_acc:
        labelss = torch.cat(label_chunks)
        acc = (predictions == labelss).sum().item() / labelss.size(0)
        # f1_score expects (y_true, y_pred); binary F1 is symmetric, so this
        # ordering yields the same number as before but matches the API.
        return f1_score(labelss.cpu().numpy(), predictions.cpu().numpy()), acc

    return predictions

NUM_LABELS = 2

model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
# NOTE: model2 is only a second name for the very same module, not an
# independent copy; training `model` changes what `model2` refers to as well.
model2 = model

trainset = FakeNewsDataset("train", tokenizer=tokenizer)
trainloader = DataLoader(trainset, batch_size=32, collate_fn=create_mini_batch,shuffle=True)

testset = FakeNewsDataset("testt", tokenizer=tokenizer)
testloader = DataLoader(testset, batch_size=256, collate_fn=create_mini_batch)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-6)

EPOCHS = 15
record_max = 0            # best validation accuracy seen so far
record_max_f1_score = 0   # best validation F1 seen so far
e = 0                     # epoch at which the best validation F1 occurred

# Fine-tune for EPOCHS epochs. After every epoch the model is scored on both
# loaders, its weights are written to "<epoch>.pt", and `e` is updated when
# the validation F1 improves.
for epoch in range(EPOCHS):

    # Summed batch losses of this epoch (kept for inspection; never printed).
    running_loss = 0.0

    # get_predictions() puts the model in eval mode and nothing inside the
    # batch loop changes the mode, so restoring training mode once per epoch
    # is enough.
    model.train()

    for data in trainloader:
        tokens_tensors, segments_tensors, masks_tensors, labels = [t.to(device) for t in data]

        optimizer.zero_grad()

        # `labels` is passed in, and the first output is used as the loss.
        outputs = model(
            input_ids=tokens_tensors,
            token_type_ids=segments_tensors,
            attention_mask=masks_tensors,
            labels=labels,
        )
        loss = outputs[0]

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    f1_train, acc = get_predictions(model, trainloader, compute_acc=True)
    f1_test, tacc = get_predictions(model, testloader, compute_acc=True)

    # torch.save serialises the tensors itself; deep-copying the state dict
    # first only duplicated every weight in memory for no benefit.
    torch.save(model.state_dict(), str(epoch)+".pt")

    if record_max_f1_score < f1_test:
        e = epoch
        record_max_f1_score = f1_test
    if record_max < tacc:
        record_max = tacc

    print("epoch:",epoch,"f1_score:",f1_test,"f1_score_max:",record_max_f1_score,"\n")

# Reload the checkpoint written at epoch `e` (the epoch with the best
# validation F1 recorded by the training loop) and score it once more.
model.load_state_dict(torch.load(str(e)+".pt"))
model.eval()
pt, tacc = get_predictions(model, testloader, compute_acc=True)
print(tacc,e)

# Store the competition test set as submit.tsv with the columns
# (text_a, id), which is what FakeNewsDataset("submit") reads back.
df_submit = pd.read_csv("../input/8888888888888/test.csv")[['text', 'id']]
df_submit = df_submit.rename(columns={'text': "text_a"})

df_submit.to_csv("submit.tsv", sep="\t", index=False)
print(df_submit)

submitset = FakeNewsDataset("submit", tokenizer=tokenizer)
submitloader = DataLoader(submitset, collate_fn=create_mini_batch, batch_size=1)

predictions = get_predictions(model, submitloader)

# Invert label_map so predicted class indices map back to dataset labels.
index_map = dict((class_idx, label) for label, class_idx in testset.label_map.items())

print(e)

# One row per test example: its id next to the predicted label.
df = pd.DataFrame({"target": [index_map[p] for p in predictions.tolist()]})
df_pred = pd.concat([submitset.df[["id"]], df["target"]], axis=1)
df_pred.to_csv('kkkkkk'+str(e)+'.csv', index=False)


In [None]:
# pip install transformers
# pip  install openpyxl
import torch
from transformers import BertTokenizer
from IPython.display import clear_output
import os
import pandas as pd
import re
import random
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import BertForSequenceClassification
import gc
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from copy import deepcopy

PRETRAINED_MODEL_NAME = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
clear_output()
print("PyTorch 版本：", torch.__version__)

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

df_train = pd.read_csv('../input/8888888888888/train.csv')

# Keep only the text and its target, renamed to (text_a, label).
df_train = df_train.loc[:, ['text', 'target']]
df_train.columns = ['text_a', 'label']

# Shuffle whole rows in a single reproducible step.  The previous approach
# (random.shuffle on each `.loc` column selection with the same seed) only
# affected df_train when pandas happened to hand back views; under
# Copy-on-Write it silently leaves the frame unshuffled.
seed = 696969
random.seed(seed)
df_train = df_train.sample(frac=1, random_state=seed).reset_index(drop=True)


def _label_counts(frame):
    """Return a (label, count) frame describing the class balance of `frame`."""
    counts = frame['label'].value_counts()
    return pd.DataFrame({'label': counts.index, 'count': counts.values})


df_cat = _label_counts(df_train)
print(df_cat)

# Rows before TRAIN_SIZE are used for training, the remainder for validation.
TRAIN_SIZE = 5500

df_train_train = df_train.loc[:, ['text_a', 'label']][:TRAIN_SIZE]
print("train樣本數：", len(df_train_train))
df_train_train.to_csv("train.tsv", sep="\t", index=False)

df_cat = _label_counts(df_train_train)
print(df_cat)

df_train_test = df_train.loc[:, ['text_a', 'label']][TRAIN_SIZE:]
print("test樣本數：", len(df_train_test))
df_train_test.to_csv("testt.tsv", sep="\t", index=False)

df_cat = _label_counts(df_train_test)
print(df_cat)

class FakeNewsDataset(Dataset):
    """Dataset backed by "<mode>.tsv" that yields BERT input tensors per row.

    Each item is a tuple (token ids, segment ids, label).  The label is a
    0-dim tensor for the "train" / "testt" modes and None for "submit".
    """

    def __init__(self, mode, tokenizer):
        assert mode in ["train", "testt", "submit"]
        self.mode = mode
        # Empty cells are read as NaN; replace them with empty strings.
        frame = pd.read_csv(mode + ".tsv", sep="\t")
        self.df = frame.fillna("")
        self.len = len(self.df)
        self.label_map = {0:0,1:1}
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """Return the tensors for row `idx` of the backing frame."""
        # Rows are unpacked positionally: (text, id) in submit mode,
        # (text, label) otherwise.
        row = self.df.iloc[idx, :].values
        if self.mode == "submit":
            text_a, _ = row
            label_tensor = None
        else:
            text_a, label = row
            label_tensor = torch.tensor(self.label_map[label])

        # Single-sentence input: [CLS] tokens... [SEP]
        word_pieces = ["[CLS]"] + self.tokenizer.tokenize(text_a) + ["[SEP]"]
        tokens_tensor = torch.tensor(self.tokenizer.convert_tokens_to_ids(word_pieces))

        # Only one sentence is present, so every position gets segment id 0.
        segments_tensor = torch.zeros(len(word_pieces), dtype=torch.long)
        return (tokens_tensor, segments_tensor, label_tensor)

    def __len__(self):
        return self.len

def create_mini_batch(samples):
    """Collate dataset items into zero-padded batch tensors.

    Args:
        samples: list of (tokens_tensor, segments_tensor, label_tensor)
            tuples; label_tensor is None for unlabeled data.

    Returns:
        (tokens_tensors, segments_tensors, masks_tensors, label_ids), where
        the first three are padded to the longest sequence in the batch and
        label_ids is None when the first sample carries no label.
    """
    token_seqs = [item[0] for item in samples]
    segment_seqs = [item[1] for item in samples]

    # The first sample decides whether this batch is labeled.
    first_label = samples[0][2]
    label_ids = None if first_label is None else torch.stack([item[2] for item in samples])

    # Pad with zeros up to the longest sequence in the batch.
    tokens_tensors = pad_sequence(token_seqs, batch_first=True)
    segments_tensors = pad_sequence(segment_seqs, batch_first=True)

    # Attention mask: 1 where the token id is non-zero, 0 on padding.
    masks_tensors = (tokens_tensors != 0).long()

    return tokens_tensors, segments_tensors, masks_tensors, label_ids

def get_predictions(model, dataloader, compute_acc=False):
    """Run `model` over `dataloader` in eval mode and collect its predictions.

    Args:
        model: module called with input_ids / token_type_ids / attention_mask
            keyword arguments; its first output is treated as the logits.
        dataloader: iterable of (tokens, segments, masks, labels) batches.
            The labels entry may be None when compute_acc is False.
        compute_acc: when True, score the predictions against the labels.

    Returns:
        compute_acc False: 1-D tensor of predicted class indices, or None if
        the dataloader produced no batches.
        compute_acc True: tuple (f1, accuracy).

    Raises:
        ValueError: compute_acc is True but the dataloader produced no batches.
    """
    model.eval()
    # Follow the model to whichever device it lives on instead of assuming
    # "cuda:0".
    device = next(model.parameters()).device

    # Collect per-batch chunks and concatenate once at the end; concatenating
    # inside the loop re-copies everything gathered so far on every batch.
    pred_chunks = []
    label_chunks = []
    with torch.no_grad():
        for data in dataloader:
            # Drop the missing label entry (unlabeled data) while moving.
            data = [t.to(device) for t in data if t is not None]

            tokens_tensors, segments_tensors, masks_tensors = data[:3]
            outputs = model(input_ids=tokens_tensors,
                            token_type_ids=segments_tensors,
                            attention_mask=masks_tensors)

            logits = outputs[0]
            _, pred = torch.max(logits, 1)
            pred_chunks.append(pred)

            if compute_acc:
                label_chunks.append(data[3])

    if not pred_chunks:
        if compute_acc:
            raise ValueError("cannot compute metrics: dataloader produced no batches")
        return None

    predictions = torch.cat(pred_chunks)

    if compute_acc:
        labelss = torch.cat(label_chunks)
        acc = (predictions == labelss).sum().item() / labelss.size(0)
        # f1_score expects (y_true, y_pred); binary F1 is symmetric, so this
        # ordering yields the same number as before but matches the API.
        return f1_score(labelss.cpu().numpy(), predictions.cpu().numpy()), acc

    return predictions

NUM_LABELS = 2

model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
# NOTE: model2 is only a second name for the very same module, not an
# independent copy; training `model` changes what `model2` refers to as well.
model2 = model

trainset = FakeNewsDataset("train", tokenizer=tokenizer)
trainloader = DataLoader(trainset, batch_size=32, collate_fn=create_mini_batch,shuffle=True)

testset = FakeNewsDataset("testt", tokenizer=tokenizer)
testloader = DataLoader(testset, batch_size=256, collate_fn=create_mini_batch)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-6)

EPOCHS = 13
record_max = 0            # best validation accuracy seen so far
record_max_f1_score = 0   # best validation F1 seen so far
e = 0                     # epoch at which the best validation F1 occurred

# Fine-tune for EPOCHS epochs. After every epoch the model is scored on both
# loaders, its weights are written to "<epoch>.pt", and `e` is updated when
# the validation F1 improves.
for epoch in range(EPOCHS):

    # Summed batch losses of this epoch (kept for inspection; never printed).
    running_loss = 0.0

    # get_predictions() puts the model in eval mode and nothing inside the
    # batch loop changes the mode, so restoring training mode once per epoch
    # is enough.
    model.train()

    for data in trainloader:
        tokens_tensors, segments_tensors, masks_tensors, labels = [t.to(device) for t in data]

        optimizer.zero_grad()

        # `labels` is passed in, and the first output is used as the loss.
        outputs = model(
            input_ids=tokens_tensors,
            token_type_ids=segments_tensors,
            attention_mask=masks_tensors,
            labels=labels,
        )
        loss = outputs[0]

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    f1_train, acc = get_predictions(model, trainloader, compute_acc=True)
    f1_test, tacc = get_predictions(model, testloader, compute_acc=True)

    # torch.save serialises the tensors itself; deep-copying the state dict
    # first only duplicated every weight in memory for no benefit.
    torch.save(model.state_dict(), str(epoch)+".pt")

    if record_max_f1_score < f1_test:
        e = epoch
        record_max_f1_score = f1_test
    if record_max < tacc:
        record_max = tacc

    print("epoch:",epoch,"f1_score:",f1_test,"f1_score_max:",record_max_f1_score,"\n")

# Reload the checkpoint written at epoch `e` (the epoch with the best
# validation F1 recorded by the training loop) and score it once more.
model.load_state_dict(torch.load(str(e)+".pt"))
model.eval()
pt, tacc = get_predictions(model, testloader, compute_acc=True)
print(tacc,e)

# Store the competition test set as submit.tsv with the columns
# (text_a, id), which is what FakeNewsDataset("submit") reads back.
df_submit = pd.read_csv("../input/8888888888888/test.csv")[['text', 'id']]
df_submit = df_submit.rename(columns={'text': "text_a"})

df_submit.to_csv("submit.tsv", sep="\t", index=False)
print(df_submit)

submitset = FakeNewsDataset("submit", tokenizer=tokenizer)
submitloader = DataLoader(submitset, collate_fn=create_mini_batch, batch_size=1)

predictions = get_predictions(model, submitloader)

# Invert label_map so predicted class indices map back to dataset labels.
index_map = dict((class_idx, label) for label, class_idx in testset.label_map.items())

print(e)

# One row per test example: its id next to the predicted label.
df = pd.DataFrame({"target": [index_map[p] for p in predictions.tolist()]})
df_pred = pd.concat([submitset.df[["id"]], df["target"]], axis=1)
df_pred.to_csv('kkkkkk'+str(e)+'.csv', index=False)
