/
cduplicatefinder.cpp
105 lines (78 loc) · 2.99 KB
/
cduplicatefinder.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#include "cduplicatefinder.h"
#include <iostream>
#include "duplicatealgos.h"
// Builds a finder over the given file entries.
// dups: the entries to examine; the vector is moved into m_duplicates, so the
//       caller's vector is left in a moved-from state.
CDuplicateFinder::CDuplicateFinder(std::vector<CFileEntry>&& dups)
    : m_duplicates(std::move(dups))
{
}
// The destructor performs no manual cleanup of its own.
CDuplicateFinder::~CDuplicateFinder() = default;
void CDuplicateFinder::doWork(){
if(!std::is_sorted(m_duplicates.begin(), m_duplicates.end(),
[](const CFileEntry& lhs, const CFileEntry& rhs){
return lhs.m_filesize < rhs.m_filesize;})){
sortFilesize();
}
findDuplicatesFileSize(m_duplicates.begin());
std::vector<CFileEntry>::iterator itr_hashed_end = partitionHash();
sortHashes(itr_hashed_end);
findHashDuplicates(itr_hashed_end);
}
// Orders all entries of m_duplicates ascending by m_filesize.
// std::sort is not stable, so entries of equal size may change relative order.
void CDuplicateFinder::sortFilesize()
{
    const auto smallerFile = [](const auto& a, const auto& b) {
        return a.m_filesize < b.m_filesize;
    };
    std::sort(m_duplicates.begin(), m_duplicates.end(), smallerFile);
}
// Hashes every entry that shares its file size with a neighbouring entry.
//
// itr: position in m_duplicates at which the scan starts; everything from
//      there to the end of the vector is examined.
//
// Only adjacent entries are compared, so equal sizes have to sit next to each
// other; doWork() guarantees this by ordering the entries by size beforehand.
// For each run of equal-sized entries createHash() is called on every member
// of the run. Entries with a unique size are left untouched.
//
// The scan is iterative: the number of size groups is bounded only by the
// input, so one stack frame per group could exhaust the stack on large inputs.
void CDuplicateFinder::findDuplicatesFileSize(std::vector<CFileEntry>::iterator itr)
{
    const auto itr_end = m_duplicates.end();
    const auto sameSize = [](const auto& lhs, const auto& rhs) {
        return lhs.m_filesize == rhs.m_filesize;
    };

    // itr_base marks the first entry of the next run of potential duplicates.
    auto itr_base = std::adjacent_find(itr, itr_end, sameSize);
    while (itr_base != itr_end) {
        // Prepare every member of the run for the deep (hash based) comparison.
        auto itr_dup = itr_base;
        while (itr_dup != itr_end && itr_base->m_filesize == itr_dup->m_filesize) {
            itr_dup->createHash();
            ++itr_dup;
        }
        // Continue behind the run that was just processed.
        itr_base = std::adjacent_find(itr_dup, itr_end, sameSize);
    }
}
// Moves all entries with a non-empty m_hash to the front of m_duplicates,
// keeping the relative order inside both groups (stable partition).
// Returns the iterator one past the last hashed entry, i.e. the end of the
// range of duplicate candidates.
std::vector<CFileEntry>::iterator CDuplicateFinder::partitionHash()
{
    const auto hasHash = [](const auto& entry) {
        return !entry.m_hash.empty();
    };
    return std::stable_partition(m_duplicates.begin(), m_duplicates.end(), hasHash);
}
// Orders the duplicate candidates ascending by m_hash.
// itr_hashed_end: end of the candidate range; only [begin, itr_hashed_end)
//                 is sorted, the remaining entries are not touched.
void CDuplicateFinder::sortHashes(std::vector<CFileEntry>::iterator itr_hashed_end)
{
    const auto smallerHash = [](const auto& a, const auto& b) {
        return a.m_hash < b.m_hash;
    };
    std::sort(m_duplicates.begin(), itr_hashed_end, smallerHash);
}
void CDuplicateFinder::findHashDuplicates(std::vector<CFileEntry>::iterator itr_hashed_end){
std::pair<std::vector<CFileEntry>::iterator, std::vector<CFileEntry>::iterator> range;
//defined in duplicatealgos.h
range = duplicateRange(m_duplicates.begin(),
itr_hashed_end);
//check for a valid range
//don't invalidate the iterators => copy elements
while(range.first != itr_hashed_end){
std::copy(range.first,
range.second,
std::back_inserter(m_results));
range = duplicateRange(range.second,
itr_hashed_end);
}
}
// Hands out the collected duplicates as an rvalue reference to m_results.
// The call itself moves nothing; m_results is only emptied when the caller
// move-constructs or move-assigns from the returned reference.
std::vector<CFileEntry>&& CDuplicateFinder::results()
{
    auto& collected = m_results;
    return std::move(collected);
}